From 6dda9266913ad57e09afc1a10d6473f10c806a63 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Thu, 11 Aug 2011 15:14:39 -0700 Subject: pstore: defer inserting OOPS entries into pstore Life is simple for all the kernel-terminating types of kmsg_dump callbacks - pstore just saves the tail end of the console log. But for "oops" the situation is more complex - the kernel may carry on running (possibly forever). So we'd like to make the logged copy of the oops appear in the pstore filesystem, so that the user has a handle to clear the entry from the persistent backing store (if we don't, the store may fill with "oops" entries, which are also safely stashed in /var/log/messages, leaving no space for real errors). Current code calls pstore_mkfile() immediately. But this may not be safe. The oops could have happened with arbitrary locks held, or in interrupt or NMI context. So allocating memory and calling into generic filesystem code seems unwise. This patch defers making the entry appear. At the time of the oops, we merely set a flag "pstore_new_entry" noting that a new entry has been added. A periodic timer checks once a minute to see if the flag is set - if so, it schedules a work item to rescan the backing store and make all new entries appear in the pstore filesystem. Signed-off-by: Tony Luck --- fs/pstore/inode.c | 40 ++++++++++++++++++++++++++++++++++++++++---- fs/pstore/internal.h | 2 +- fs/pstore/platform.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 82 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 893b961dcfd8..379a02dc1217 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,13 +33,18 @@ #include #include #include +#include #include #include "internal.h" #define PSTORE_NAMELEN 64 +static DEFINE_SPINLOCK(allpstore_lock); +static LIST_HEAD(allpstore); + struct pstore_private { + struct list_head list; struct pstore_info *psi; enum pstore_type_id type; u64 id; @@ -81,8 +87,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { + struct pstore_private *p = inode->i_private; + unsigned long flags; + end_writeback(inode); - kfree(inode->i_private); + if (p) { + spin_lock_irqsave(&allpstore_lock, flags); + list_del(&p->list); + spin_unlock_irqrestore(&allpstore_lock, flags); + kfree(p); + } } static const struct inode_operations pstore_dir_inode_operations = { @@ -182,9 +196,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; - int rc; + int rc = 0; char name[PSTORE_NAMELEN]; - struct pstore_private *private; + struct pstore_private *private, *pos; + unsigned long flags; + + spin_lock_irqsave(&allpstore_lock, flags); + list_for_each_entry(pos, &allpstore, list) { + if (pos->type == type && + pos->id == id && + pos->psi == psi) { + rc = -EEXIST; + break; + } + } + spin_unlock_irqrestore(&allpstore_lock, flags); + if (rc) + return rc; rc = -ENOMEM; inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); @@ -229,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, d_add(dentry, inode); + spin_lock_irqsave(&allpstore_lock, flags); + list_add(&private->list, &allpstore); + spin_unlock_irqrestore(&allpstore_lock, flags); + mutex_unlock(&root->d_inode->i_mutex); return 0; @@ -277,7 +309,7 @@ int
pstore_fill_super(struct super_block *sb, void *data, int silent) goto fail; } - pstore_get_records(); + pstore_get_records(0); return 0; fail: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 611c1b3c46fa..3bde461c3f34 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,5 +1,5 @@ extern void pstore_set_kmsg_bytes(int); -extern void pstore_get_records(void); +extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, struct timespec time, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c5300ec31696..ca60ebcfb15f 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -25,11 +25,28 @@ #include #include #include +#include #include #include +#include #include "internal.h" +/* + * We defer making "oops" entries appear in pstore - see + * whether the system is actually still running well enough + * to let someone see the entry + */ +#define PSTORE_INTERVAL (60 * HZ) + +static int pstore_new_entry; + +static void pstore_timefunc(unsigned long); +static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); + +static void pstore_dowork(struct work_struct *); +static DECLARE_WORK(pstore_work, pstore_dowork); + /* * pstore_lock just protects "psinfo" during * calls to pstore_register() @@ -100,9 +117,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, id = psinfo->write(PSTORE_TYPE_DMESG, part, hsize + l1_cpy + l2_cpy, psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, - psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo); + pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; @@ -148,19 +163,24 @@ int pstore_register(struct pstore_info *psi) } if (pstore_is_mounted()) - pstore_get_records(); + pstore_get_records(0); kmsg_dump_register(&pstore_dumper); + pstore_timer.expires = jiffies + PSTORE_INTERVAL; + add_timer(&pstore_timer); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); /* - * Read all the records from the persistent store. Create and - * file files in our filesystem. + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. */ -void pstore_get_records(void) +void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; ssize_t size; @@ -178,8 +198,9 @@ void pstore_get_records(void) goto out; while ((size = psi->read(&id, &type, &time, psi)) > 0) { - if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi)) + rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, + time, psi); + if (rc && (rc != -EEXIST || !quiet)) failed++; } psi->close(psi); @@ -191,6 +212,21 @@ out: failed, psi->name); } +static void pstore_dowork(struct work_struct *work) +{ + pstore_get_records(1); +} + +static void pstore_timefunc(unsigned long dummy) +{ + if (pstore_new_entry) { + pstore_new_entry = 0; + schedule_work(&pstore_work); + } + + mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); +} + /* * Call platform driver to write a record to the * persistent store. -- cgit v1.2.3-58-ga151 From abd4d5587be911f63592537284dad78766d97d62 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Aug 2011 10:54:51 -0700 Subject: pstore: change mutex locking to spin_locks pstore was using mutex locking to protect read/write access to the backend plug-ins. 
This causes problems when pstore is executed in an NMI context through panic() -> kmsg_dump(). This patch changes the mutex to a spinlock taken with spin_lock_irqsave, and also checks to see if we are in an NMI context. If we are in an NMI and can't get the lock, just print a message stating that and blow by the locking. All this is probably a hack around the bigger locking problem, but it solves my current situation of trying to sleep in an NMI context. Tested by loading the lkdtm module and executing a HARDLOCKUP, which will cause the machine to panic inside the NMI handler. Signed-off-by: Don Zickus Acked-by: Matthew Garrett Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 2 +- drivers/firmware/efivars.c | 2 +- fs/pstore/platform.c | 28 +++++++++++++++++++++------- include/linux/pstore.h | 2 +- 4 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 2ca59dc69f7f..5e820ea35570 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -1165,7 +1165,7 @@ static int __init erst_init(void) goto err_release_erange; buf = kmalloc(erst_erange.size, GFP_KERNEL); - mutex_init(&erst_info.buf_mutex); + spin_lock_init(&erst_info.buf_lock); if (buf) { erst_info.buf = buf + sizeof(struct cper_pstore_record); erst_info.bufsize = erst_erange.size - diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index eb80b549ed8d..be8bcb035e2a 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -978,7 +978,7 @@ int register_efivars(struct efivars *efivars, if (efivars->efi_pstore_info.buf) { efivars->efi_pstore_info.bufsize = 1024; efivars->efi_pstore_info.data = efivars; - mutex_init(&efivars->efi_pstore_info.buf_mutex); + spin_lock_init(&efivars->efi_pstore_info.buf_lock); pstore_register(&efivars->efi_pstore_info); } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index ca60ebcfb15f..0472924024cc 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "internal.h" @@ -88,13 +89,20 @@ static void pstore_dump(struct kmsg_dumper *dumper, u64 id; int hsize; unsigned int part = 1; + unsigned long flags = 0; + int is_locked = 0; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; else why = "Unknown"; - mutex_lock(&psinfo->buf_mutex); + if (in_nmi()) { + is_locked = spin_trylock(&psinfo->buf_lock); + if (!is_locked) + pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + } else + spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; @@ -123,7 +131,11 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += l1_cpy + l2_cpy; part++; } - mutex_unlock(&psinfo->buf_mutex); + if (in_nmi()) { + if (is_locked) + spin_unlock(&psinfo->buf_lock); + } else + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -188,11 +200,12 @@ void pstore_get_records(int quiet) enum pstore_type_id type; struct timespec time; int failed = 0, rc; + unsigned long flags; if (!psi) return; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); rc = psi->open(psi); if (rc) goto out; @@ -205,7 +218,7 @@ void pstore_get_records(int quiet) } psi->close(psi); out: - mutex_unlock(&psinfo->buf_mutex); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); if (failed) printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", @@ -233,7 +246,8 @@ static void
pstore_timefunc(unsigned long dummy) */ int pstore_write(enum pstore_type_id type, char *buf, size_t size) { - u64 id; + u64 id; + unsigned long flags; if (!psinfo) return -ENODEV; @@ -241,13 +255,13 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) if (size > psinfo->bufsize) return -EFBIG; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); memcpy(psinfo->buf, buf, size); id = psinfo->write(type, 0, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, size, CURRENT_TIME, psinfo); - mutex_unlock(&psinfo->buf_mutex); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); return 0; } diff --git a/include/linux/pstore.h b/include/linux/pstore.h index cc03bbf5c4b8..b91440e64d6e 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -32,7 +32,7 @@ enum pstore_type_id { struct pstore_info { struct module *owner; char *name; - struct mutex buf_mutex; /* serialize access to 'buf' */ + spinlock_t buf_lock; /* serialize access to 'buf' */ char *buf; size_t bufsize; int (*open)(struct pstore_info *psi); -- cgit v1.2.3-58-ga151 From 832023bffb4b493f230be901f681020caf3ed1f8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 8 Aug 2011 17:38:08 +0200 Subject: nfsd4: Remove check for a 32-bit cookie in nfsd4_readdir() Fan Yong noticed setting FMODE_32bithash wouldn't work with nfsd v4, as nfsd4_readdir() checks for 32 bit cookies. However, according to RFC 3530 cookies have a 64 bit type and cookies are also defined as u64 in 'struct nfsd4_readdir'. So remove the test for >32-bit values. Cc: stable@kernel.org Signed-off-by: Bernd Schubert Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e80777666618..9bf0a6625187 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -691,7 +691,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); - if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || + if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) return nfserr_bad_cookie; -- cgit v1.2.3-58-ga151 From 5a0143a4f00517ea433bf459a80742ccc623a665 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 28 Jul 2011 17:47:10 +0200 Subject: ext3: Remove i_mutex from ext3_sync_file() ext3_sync_file() does not need i_mutex for anything so just drop it. Signed-off-by: Jan Kara --- fs/ext3/fsync.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d494c554c6e6..1860ed356323 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - /* - * Taking the mutex here just to keep consistent with how fsync was - * called previously, however it looks like we don't need to take - * i_mutex at all. - */ - mutex_lock(&inode->i_mutex); - J_ASSERT(ext3_journal_current_handle() == NULL); /* @@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * safe in-journal, which is all fsync() needs to ensure. 
*/ if (ext3_should_journal_data(inode)) { - mutex_unlock(&inode->i_mutex); ret = ext3_force_commit(inode->i_sb); goto out; } @@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - - mutex_unlock(&inode->i_mutex); out: trace_ext3_sync_file_exit(inode, ret); return ret; -- cgit v1.2.3-58-ga151 From 1cde201da4e97f10a5dd2434cff4ceff381603d1 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Tue, 2 Aug 2011 18:16:57 +0900 Subject: ext3: fix message in ext3_remount for rw-remount case If there are some inodes in the orphan list while a filesystem is mounted read-only, we should recommend that people umount and then mount it when they try to remount with read-write. But the current message and comment recommend that they umount and then remount it, which may be slightly misleading. Signed-off-by: Toshiyuki Okajima Signed-off-by: Jan Kara --- fs/ext3/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7beb69ae0015..2043bcc87719 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } -- cgit v1.2.3-58-ga151 From 6e3d6ca0bf91bcce0453fff9b597154ff6bb9731 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 4 Aug 2011 12:29:32 +0200 Subject: fs/ext2/balloc.c: delete useless initialization Delete nontrivial initialization that is immediately overwritten by the result of an allocation function. The semantic match that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; identifier i; expression e; @@ ( T i = \(0\|NULL\|ERR_PTR(...)\); | -T i = e; +T i; ) ... when != i i = \(kzalloc\|kcalloc\|kmalloc\)(...); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 8f44cef1b3ef..a8cbe1bc6ad4 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv) void ext2_init_block_alloc_info(struct inode *inode) { struct ext2_inode_info *ei = EXT2_I(inode); - struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -- cgit v1.2.3-58-ga151 From 46130222df8567ffde773216044c7611a1e71d51 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 4 Aug 2011 12:29:31 +0200 Subject: fs/ext3/balloc.c: delete useless initialization Delete nontrivial initialization that is immediately overwritten by the result of an allocation function. The semantic match that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; identifier i; expression e; @@ ( T i = \(0\|NULL\|ERR_PTR(...)\); | -T i = e; +T i; ) ...
when != i i = \(kzalloc\|kcalloc\|kmalloc\)(...); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 6386d76f44a7..caedc00520ed 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -- cgit v1.2.3-58-ga151 From fbc854027c91fa2813ae7f9de43cc0b5c1119f41 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Aug 2011 18:08:06 +0200 Subject: ext3: remove deprecated oldalloc For a long time now, Orlov has been the default block allocator in ext3. It performs better than the old one, and no one seems to claim otherwise, so we can safely drop the old allocator and deprecate the oldalloc and orlov mount options. Signed-off-by: Lukas Czerner Signed-off-by: Jan Kara --- Documentation/filesystems/ext3.txt | 8 -------- fs/ext3/ialloc.c | 45 +++------------------------------------------------------- fs/ext3/super.c | 8 ++++---- include/linux/ext3_fs.h | 2 +- 4 files changed, 8 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 22f3a0eda1d2..b100adc38adb 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -73,14 +73,6 @@ nobarrier (*) This also requires an IO stack which can support also be used to enable or disable barriers, for consistency with other ext3 mount options. -orlov (*) This enables the new Orlov block allocator. It is - enabled by default. - -oldalloc This disables the Orlov block allocator and enables - the old block allocator. Orlov should have better - performance - we'd like to get some feedback if it's - the contrary for you. - user_xattr Enables Extended User Attributes. Additionally, you need to have extended attribute support enabled in the kernel configuration (CONFIG_EXT3_FS_XATTR). See the diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf938cc..635bd8ce6d59 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -177,42 +177,6 @@ error_return: ext3_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block * group to find a free inode.
- */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - /* * Orlov's allocator for directories. * @@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2043bcc87719..922d289aeeb3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 67a803aee619..96a30b95e5c2 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -381,7 +381,7 @@ struct ext3_inode { * Mount flags */ #define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ -#define EXT3_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +/* EXT3_MOUNT_OLDALLOC was there */ #define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ -- cgit v1.2.3-58-ga151 From 576163005de286bbd418fcb99cfd0971523a0c6d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 10 Aug 2011 19:16:22 -0400 Subject: nfsd4: fix seqid_mutating_error The set of errors here does *not* agree with the set of errors specified in the RFC! While we're there, turn this macro into a function, for the usual reasons, and move it to the one place where it's actually used. Cc: stable@kernel.org Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4xdr.c | 12 ++++++++++++ fs/nfsd/state.h | 6 ------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c8bf405d19de..f81099605256 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1623,6 +1623,18 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) \ save = resp->p; +static bool seqid_mutating_err(__be32 err) +{ + /* rfc 3530 section 8.1.5: */ + return err != nfserr_stale_clientid && + err != nfserr_stale_stateid && + err != nfserr_bad_stateid && + err != nfserr_bad_seqid && + err != nfserr_bad_xdr && + err != nfserr_resource && + err != nfserr_nofilehandle; +} + /* * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This * is where sequence id's are incremented, and the replay cache is filled. diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4eefaf1b42e8..5cfebe504056 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -447,12 +447,6 @@ struct nfs4_stateid { #define WR_STATE 0x00000020 #define CLOSE_STATE 0x00000040 -#define seqid_mutating_err(err) \ - (((err) != nfserr_stale_clientid) && \ - ((err) != nfserr_bad_seqid) && \ - ((err) != nfserr_stale_stateid) && \ - ((err) != nfserr_bad_stateid)) - struct nfsd4_compound_state; extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, -- cgit v1.2.3-58-ga151 From aadab6c6f4da38d639394de740602f146c88da0c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 16:55:02 -0400 Subject: nfsd4: return nfserr_symlink on v4 OPEN of non-regular file Without this, an attempt to open a device special file without first stat'ing it will fail. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9bf0a6625187..d784ceb81a62 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -168,6 +168,24 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return status; } +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = fh->fh_dentry->d_inode->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { @@ -216,6 +234,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &resfh); fh_unlock(current_fh); + if (status) + goto out; + status = nfsd_check_obj_isreg(&resfh); } if (status) goto out; -- cgit v1.2.3-58-ga151 From ab83fa4b49a54e6199b076b7d8c1808144e80f0d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Jul 2011 20:10:51 -0400 Subject: locks: minor lease cleanup Use a helper function, to simplify upcoming changes. Reviewed-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/locks.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 703f545097de..c528522862b1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -133,6 +133,11 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +static bool lease_breaking(struct file_lock *fl) +{ + return fl->fl_type & F_INPROGRESS; +} + int leases_enable = 1; int lease_break_time = 45; @@ -1141,7 +1146,7 @@ static void time_out_leases(struct inode *inode) struct file_lock *fl; before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { + while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { if ((fl->fl_break_time == 0) || time_before(jiffies, fl->fl_break_time)) { before = &fl->fl_next; @@ -1189,7 +1194,7 @@ int __break_lease(struct inode *inode, unsigned int mode) if (want_write) { /* If we want write access, we have to revoke any lease. */ future = F_UNLCK | F_INPROGRESS; - } else if (flock->fl_type & F_INPROGRESS) { + } else if (lease_breaking(flock)) { /* If the lease is already being broken, we just leave it */ future = flock->fl_type; } else if (flock->fl_type & F_WRLCK) { @@ -1246,7 +1251,7 @@ restart: /* Wait for the next lease that has not been broken yet */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (flock->fl_type & F_INPROGRESS) + if (lease_breaking(flock)) goto restart; } error = 0; @@ -2126,7 +2131,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } else if (IS_LEASE(fl)) { seq_printf(f, "LEASE "); - if (fl->fl_type & F_INPROGRESS) + if (lease_breaking(fl)) seq_printf(f, "BREAKING "); else if (fl->fl_file) seq_printf(f, "ACTIVE "); @@ -2142,7 +2147,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { seq_printf(f, "%s ", - (fl->fl_type & F_INPROGRESS) + (lease_breaking(fl)) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); } -- cgit v1.2.3-58-ga151 From 710b7216964d6455cf1b215c43b03a1a79008c7d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Jul 2011 16:28:29 -0400 Subject: locks: move F_INPROGRESS from fl_type to fl_flags field F_INPROGRESS isn't exposed to userspace. To me it makes more sense in fl_flags.... Reviewed-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- arch/alpha/include/asm/fcntl.h | 2 -- fs/locks.c | 14 ++++++++------ include/asm-generic/fcntl.h | 5 ----- include/linux/fs.h | 3 ++- 4 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h index 1b71ca70c9f6..6d9e805f18a7 100644 --- a/arch/alpha/include/asm/fcntl.h +++ b/arch/alpha/include/asm/fcntl.h @@ -51,8 +51,6 @@ #define F_EXLCK 16 /* or 3 */ #define F_SHLCK 32 /* or 4 */ -#define F_INPROGRESS 64 - #include #endif diff --git a/fs/locks.c b/fs/locks.c index c528522862b1..c4215418bca3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,7 @@ static bool lease_breaking(struct file_lock *fl) { - return fl->fl_type & F_INPROGRESS; + return fl->fl_flags & FL_INPROGRESS; } int leases_enable = 1; @@ -1132,6 +1132,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; + fl->fl_flags &= ~FL_INPROGRESS; locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1152,7 +1153,7 @@ static void time_out_leases(struct inode *inode) before = &fl->fl_next; continue; } - lease_modify(before, fl->fl_type & ~F_INPROGRESS); + lease_modify(before, fl->fl_type); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1193,13 +1194,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if (want_write) { /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK | F_INPROGRESS; + future = F_UNLCK; } else if (lease_breaking(flock)) { /* If the lease is already being broken, we just leave it */ future = flock->fl_type; } else if (flock->fl_type & F_WRLCK) { /* Downgrade the exclusive lease to a read-only lease. */ - future = F_RDLCK | F_INPROGRESS; + future = F_RDLCK; } else { /* the existing lease was read-only, so we can read too. 
*/ goto out; @@ -1221,6 +1222,7 @@ int __break_lease(struct inode *inode, unsigned int mode) for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_type != future) { fl->fl_type = future; + fl->fl_flags |= FL_INPROGRESS; fl->fl_break_time = break_time; /* lease must have lmops break callback */ fl->fl_lmops->lm_break(fl); @@ -1319,7 +1321,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type & ~F_INPROGRESS; + type = fl->fl_type; break; } } @@ -1384,7 +1386,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) before = &fl->fl_next) { if (fl->fl_file == filp) my_before = before; - else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) + else if ((fl->fl_type == F_UNLCK) && lease_breaking(fl)) /* * Someone is in the process of opening this * file for writing so we may not take an diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h index 84793c7025e2..9e5b0356e2bb 100644 --- a/include/asm-generic/fcntl.h +++ b/include/asm-generic/fcntl.h @@ -145,11 +145,6 @@ struct f_owner_ex { #define F_SHLCK 8 /* or 4 */ #endif -/* for leases */ -#ifndef F_INPROGRESS -#define F_INPROGRESS 16 -#endif - /* operations for bsd flock(), also used by the kernel implementation */ #define LOCK_SH 1 /* shared lock */ #define LOCK_EX 2 /* exclusive lock */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 178cdb4f1d4a..327fdd4de85f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1065,6 +1065,7 @@ static inline int file_check_writeable(struct file *filp) #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ +#define FL_INPROGRESS 256 /* Lease is being broken */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -1111,7 +1112,7 @@ struct file_lock { struct list_head fl_link; /* doubly linked list of all locks */ struct list_head fl_block; /* circular list of blocked processes */ fl_owner_t fl_owner; - unsigned char fl_flags; + unsigned int fl_flags; unsigned char fl_type; unsigned int fl_pid; struct pid *fl_nspid; -- cgit v1.2.3-58-ga151 From 778fc546f749c588aa2f6cd50215d2715c374252 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Jul 2011 18:25:49 -0400 Subject: locks: fix tracking of inprogress lease breaks We currently use a bit in fl_flags to record whether a lease is being broken, and set fl_type to the type (RDLCK or UNLCK) that it will eventually have. This means that once the lease break starts, we forget what the lease's type *used* to be. Breaking a read lease will then result in blocking read opens, even though there's no conflict--because the lease type is now F_UNLCK and we can no longer tell whether it was previously a read or write lease. So, instead keep fl_type as the original type (the type which we enforce), and keep track of whether we're unlocking or merely downgrading by replacing the single FL_INPROGRESS flag by FL_UNLOCK_PENDING and FL_DOWNGRADE_PENDING flags. To get this right we also need to track separate downgrade and break times, to handle the case where a write-leased file gets conflicting opens first for read, then later for write. 
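To make the tracked states concrete, here is a minimal userspace sketch (an editor's illustration, not part of the patch; the file name "testfile" is an assumption) of the lease API whose semantics this change straightens out. While a break is in progress, F_GETLEASE reports the target type the lease is converting to, and a read-only conflict merely downgrades a write lease:

    #define _GNU_SOURCE             /* for F_SETLEASE/F_GETLEASE */
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void on_break(int sig) { (void)sig; /* SIGIO: lease is being broken */ }

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        signal(SIGIO, on_break);        /* default lease-break notification */
        if (fcntl(fd, F_SETLEASE, F_WRLCK) < 0) {
            perror("F_SETLEASE");       /* needs file ownership or CAP_LEASE */
            return 1;
        }
        /* With no conflicting opener this prints F_WRLCK. If another
         * process opens the file O_RDONLY, the kernel starts a downgrade
         * and this reports F_RDLCK (the target type); an O_RDWR opener
         * starts a full break and it reports F_UNLCK. */
        printf("lease type: %d (F_WRLCK=%d)\n", fcntl(fd, F_GETLEASE), F_WRLCK);

        pause();                        /* wait for a conflicting open */
        fcntl(fd, F_SETLEASE, F_UNLCK); /* voluntarily release */
        return 0;
    }

Run it, then open the file from a second shell read-only or read-write to observe the downgrade and the full break respectively.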
(I first considered just eliminating the downgrade behavior completely--nfsv4 doesn't need it, and nobody as far as I can tell actually uses it currently--but Jeremy Allison tells me that Windows oplocks do behave this way, so Samba will probably use this some day.) Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/locks.c | 87 ++++++++++++++++++++++++++++++++++-------------------- include/linux/fs.h | 7 +++-- 2 files changed, 60 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index c4215418bca3..c525aa4de234 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,16 @@ static bool lease_breaking(struct file_lock *fl) { - return fl->fl_flags & FL_INPROGRESS; + return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); +} + +static int target_leasetype(struct file_lock *fl) +{ + if (fl->fl_flags & FL_UNLOCK_PENDING) + return F_UNLCK; + if (fl->fl_flags & FL_DOWNGRADE_PENDING) + return F_RDLCK; + return fl->fl_type; } int leases_enable = 1; @@ -1124,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode, EXPORT_SYMBOL(locks_mandatory_area); +static void lease_clear_pending(struct file_lock *fl, int arg) +{ + switch (arg) { + case F_UNLCK: + fl->fl_flags &= ~FL_UNLOCK_PENDING; + /* fall through: */ + case F_RDLCK: + fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + } +} + /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock **before, int arg) { @@ -1132,7 +1152,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; - fl->fl_flags &= ~FL_INPROGRESS; + lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1141,6 +1161,14 @@ int lease_modify(struct file_lock **before, int arg) EXPORT_SYMBOL(lease_modify); +static bool past_time(unsigned long then) +{ + if (!then) + /* 0 is a special value meaning "this never expires": */ + return false; + return time_after(jiffies, then); +} + static void time_out_leases(struct inode *inode) { struct file_lock **before; @@ -1148,12 +1176,10 @@ static void time_out_leases(struct inode *inode) before = &inode->i_flock; while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { - if ((fl->fl_break_time == 0) - || time_before(jiffies, fl->fl_break_time)) { - before = &fl->fl_next; - continue; - } - lease_modify(before, fl->fl_type); + if (past_time(fl->fl_downgrade_time)) + lease_modify(before, F_RDLCK); + if (past_time(fl->fl_break_time)) + lease_modify(before, F_UNLCK); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1171,7 +1197,7 @@ static void time_out_leases(struct inode *inode) */ int __break_lease(struct inode *inode, unsigned int mode) { - int error = 0, future; + int error = 0; struct file_lock *new_fl, *flock; struct file_lock *fl; unsigned long break_time; @@ -1188,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if ((flock == NULL) || !IS_LEASE(flock)) goto out; + if (!locks_conflict(flock, new_fl)) + goto out; + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (want_write) { - /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK; - } else if (lease_breaking(flock)) { - /* If the lease is already being broken, we just leave it */ - future = flock->fl_type; - } else if (flock->fl_type & F_WRLCK) { - /* Downgrade the exclusive lease to a read-only lease. 
*/ - future = F_RDLCK; - } else { - /* the existing lease was read-only, so we can read too. */ - goto out; - } - if (IS_ERR(new_fl) && !i_have_this_lease && ((mode & O_NONBLOCK) == 0)) { error = PTR_ERR(new_fl); @@ -1220,13 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode) } for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (fl->fl_type != future) { - fl->fl_type = future; - fl->fl_flags |= FL_INPROGRESS; + if (want_write) { + if (fl->fl_flags & FL_UNLOCK_PENDING) + continue; + fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; - /* lease must have lmops break callback */ - fl->fl_lmops->lm_break(fl); + } else { + if (lease_breaking(flock)) + continue; + fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->fl_downgrade_time = break_time; } + fl->fl_lmops->lm_break(fl); } if (i_have_this_lease || (mode & O_NONBLOCK)) { @@ -1250,10 +1270,13 @@ restart: if (error >= 0) { if (error == 0) time_out_leases(inode); - /* Wait for the next lease that has not been broken yet */ + /* + * Wait for the next conflicting lease that has not been + * broken yet + */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (lease_breaking(flock)) + if (locks_conflict(new_fl, flock)) goto restart; } error = 0; @@ -1321,7 +1344,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type; + type = target_leasetype(fl); break; } } @@ -1386,7 +1409,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) before = &fl->fl_next) { if (fl->fl_file == filp) my_before = before; - else if ((fl->fl_type == F_UNLCK) && lease_breaking(fl)) + else if (fl->fl_flags & FL_UNLOCK_PENDING) /* * Someone is in the process of opening this * file for writing so we may not take an diff --git a/include/linux/fs.h b/include/linux/fs.h index 327fdd4de85f..76460edf1648 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1065,7 +1065,8 @@ static inline int file_check_writeable(struct file *filp) #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ -#define FL_INPROGRESS 256 /* Lease is being broken */ +#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ +#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -1122,7 +1123,9 @@ struct file_lock { loff_t fl_end; struct fasync_struct * fl_fasync; /* for lease break notifications */ - unsigned long fl_break_time; /* for nonblocking lease breaks */ + /* for lease breaks: */ + unsigned long fl_break_time; + unsigned long fl_downgrade_time; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ -- cgit v1.2.3-58-ga151 From c1f24ef4ed46f58ea5e524a2364c93b6847fb164 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 19 Aug 2011 10:59:49 -0400 Subject: locks: setlease cleanup There's an incorrect comment here. Also clean up the logic: the "rdlease" and "wrlease" locals are confusingly named, and don't really add anything since we can make a decision as soon as we hit one of these cases. Signed-off-by: J. 
Bruce Fields --- fs/locks.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index c525aa4de234..9b8408eb6946 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1368,7 +1368,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - int error, rdlease_count = 0, wrlease_count = 0; + int error; lease = *flp; @@ -1404,27 +1404,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) * then the file is not open by anyone (including us) * except for this filp. */ + error = -EAGAIN; for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file == filp) + if (fl->fl_file == filp) { my_before = before; - else if (fl->fl_flags & FL_UNLOCK_PENDING) - /* - * Someone is in the process of opening this - * file for writing so we may not take an - * exclusive lease on it. - */ - wrlease_count++; - else - rdlease_count++; + continue; + } + /* + * No exclusive leases if someone else has a lease on + * this file: + */ + if (arg == F_WRLCK) + goto out; + /* + * Modifying our existing lease is OK, but no getting a + * new lease if someone else is opening for write: + */ + if (fl->fl_flags & FL_UNLOCK_PENDING) + goto out; } - error = -EAGAIN; - if ((arg == F_RDLCK && (wrlease_count > 0)) || - (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) - goto out; - if (my_before != NULL) { error = lease->fl_lmops->lm_change(my_before, arg); if (!error) -- cgit v1.2.3-58-ga151 From 11fd165c68b73434ca1273e21f21db5eecc90926 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Jul 2011 20:04:09 +0200 Subject: sunrpc: use better NUMA affinities Use NUMA aware allocations to reduce latencies and increase throughput. sunrpc kthreads can use kthread_create_on_node() if pool_mode is "percpu" or "pernode", and svc_prepare_thread()/svc_init_buffer() can also take into account NUMA node affinity for memory allocations. Signed-off-by: Eric Dumazet CC: "J. Bruce Fields" CC: Neil Brown CC: David Miller Reviewed-by: Greg Banks [bfields@redhat.com: fix up caller nfs41_callback_up] Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 4 ++-- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 33 ++++++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d7979d..c061b9aa7ddb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -282,7 +282,7 @@ int lockd_up(void) /* * Create the kernel thread and wait for it to start. 
*/ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(nlmsvc_rqst)) { error = PTR_ERR(nlmsvc_rqst); nlmsvc_rqst = NULL; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index e3d294269058..516f3375e067 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv) else goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0]); + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); out_err: if (ret == 0) @@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(rqstp)) { svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 223588a976a0..a78a51e93373 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -404,7 +404,7 @@ struct svc_procedure { struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, - struct svc_pool *pool); + struct svc_pool *pool, int node); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *), diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 6a69a1131fb7..30d70abb4e2c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -295,6 +295,18 @@ svc_pool_map_put(void) } +static int svc_pool_map_get_node(unsigned int pidx) +{ + const struct svc_pool_map *m = &svc_pool_map; + + if (m->count) { + if (m->mode == SVC_POOL_PERCPU) + return cpu_to_node(m->pool_to[pidx]); + if (m->mode == SVC_POOL_PERNODE) + return m->pool_to[pidx]; + } + return NUMA_NO_NODE; +} /* * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. @@ -499,7 +511,7 @@ EXPORT_SYMBOL_GPL(svc_destroy); * We allocate pages and place them in rq_argpages. 
*/ static int -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) { unsigned int pages, arghi; @@ -513,7 +525,7 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) arghi = 0; BUG_ON(pages > RPCSVC_MAXPAGES); while (pages) { - struct page *p = alloc_page(GFP_KERNEL); + struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); if (!p) break; rqstp->rq_pages[arghi++] = p; @@ -536,11 +548,11 @@ svc_release_buffer(struct svc_rqst *rqstp) } struct svc_rqst * -svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; - rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); + rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) goto out_enomem; @@ -554,15 +566,15 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_thread; - rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) goto out_thread; - if (!svc_init_buffer(rqstp, serv->sv_max_mesg)) + if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) goto out_thread; return rqstp; @@ -647,6 +659,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) struct svc_pool *chosen_pool; int error = 0; unsigned int state = serv->sv_nrthreads-1; + int node; if (pool == NULL) { /* The -1 assumes caller has done a svc_get() */ @@ -662,14 +675,16 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) nrservs--; chosen_pool = choose_pool(serv, pool, &state); - rqstp = svc_prepare_thread(serv, chosen_pool); + node = svc_pool_map_get_node(chosen_pool->sp_id); + rqstp = svc_prepare_thread(serv, chosen_pool, node); if (IS_ERR(rqstp)) { error = PTR_ERR(rqstp); break; } __module_get(serv->sv_module); - task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + task = kthread_create_on_node(serv->sv_function, rqstp, + node, serv->sv_name); if (IS_ERR(task)) { error = PTR_ERR(task); module_put(serv->sv_module); -- cgit v1.2.3-58-ga151 From d37854cf99319966f34bb19c7a897b87d478b56c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 22 Aug 2011 16:23:56 +0300 Subject: UBIFS: introduce a helper to dump scanning info This commit adds 'dbg_dump_sleb()' helper function to dump scanning information. 
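The heart of the new helper is the list_for_each_entry() walk over sleb->nodes. For readers outside the kernel tree, this is the intrusive-list pattern: the list node is embedded in the object, and iteration maps each node back to its container. A self-contained userspace sketch (toy names, not UBIFS APIs):

    #include <stddef.h>
    #include <stdio.h>

    /* Toy stand-ins for the kernel's intrusive list machinery. */
    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Rough analog of ubifs_scan_node: payload plus an embedded node. */
    struct scan_node {
        int offs, len;
        struct list_head list;
    };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
    }

    int main(void)
    {
        struct list_head nodes;
        struct scan_node a = { 0, 128 }, b = { 128, 64 };

        list_init(&nodes);
        list_add_tail(&a.list, &nodes);
        list_add_tail(&b.list, &nodes);

        /* The equivalent of list_for_each_entry(snod, &nodes, list):
         * walk the embedded nodes, then map each back to its container. */
        for (struct list_head *p = nodes.next; p != &nodes; p = p->next) {
            struct scan_node *snod = container_of(p, struct scan_node, list);
            printf("node at %d len %d\n", snod->offs, snod->len);
        }
        return 0;
    }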
Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 16 ++++++++++++++++ fs/ubifs/debug.h | 5 +++++ 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a1a927..b09ba2dd8b62 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 45174b534377..2bf84211e320 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); @@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } static inline void dbg_dump_leb(const struct ubifs_info *c, int lnum) { return; } static inline void +dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { return; } +static inline void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode) { return; } static inline void dbg_dump_heap(struct ubifs_info *c, -- cgit v1.2.3-58-ga151 From bd33d12fba60dea3b8f944478c54ac3ba18d2c4d Mon Sep 17 00:00:00 2001 From: Harry Wei Date: Sat, 16 Jul 2011 16:45:13 +0800 Subject: debugfs: Fix a comment mistake The file is fs/debugfs/inode.c but the comment says it is file.c. This patch fixes this little mistake. Signed-off-by: Harry Wei Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e7a7a2f07324..f3a257d7a985 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,5 +1,5 @@ /* - * file.c - part of debugfs, a tiny little debug file system + * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman * Copyright (C) 2004 IBM Inc. -- cgit v1.2.3-58-ga151 From 7f9838fd01833ffb30177d964983076924344c9e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 21 Jul 2011 19:59:22 -0400 Subject: sysfs: count subdirectories sysfs: count subdirectories This patch introduces a subdirectory counter for each sysfs directory. Without the patch, sysfs_refresh_inode would walk all entries of the directory to calculate the number of subdirectories. This patch improves the time of "ls -la /sys/block" when there are 10000 block devices from 9 seconds to 0.19 seconds.
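In other words, the patch trades an O(children) recount on every stat() for O(1) bookkeeping at link/unlink time. A rough userspace sketch of that trade (toy types; in the real code the counter is maintained in sysfs_link_sibling()/sysfs_unlink_sibling() and consumed in sysfs_refresh_inode()):

    #include <assert.h>
    #include <stdio.h>

    /* Toy analog of a sysfs directory entry; only what the example needs. */
    enum { TYPE_DIR, TYPE_FILE };

    struct toy_dirent {
        int type;
        struct toy_dirent *sibling;   /* chained into the parent's list */
        struct toy_dirent *children;
        unsigned long subdirs;        /* the maintained counter */
    };

    static void link_child(struct toy_dirent *parent, struct toy_dirent *sd)
    {
        if (sd->type == TYPE_DIR)
            parent->subdirs++;        /* O(1) bookkeeping on insert;
                                       * unlink would decrement likewise */
        sd->sibling = parent->children;
        parent->children = sd;
    }

    /* The old approach: recount on every stat(), O(children). */
    static unsigned long count_nlink_slow(const struct toy_dirent *parent)
    {
        unsigned long nr = 0;
        for (const struct toy_dirent *c = parent->children; c; c = c->sibling)
            if (c->type == TYPE_DIR)
                nr++;
        return nr + 2;
    }

    int main(void)
    {
        struct toy_dirent root = { TYPE_DIR }, d = { TYPE_DIR }, f = { TYPE_FILE };

        link_child(&root, &d);
        link_child(&root, &f);
        /* The new approach: nlink is just subdirs + 2, O(1). */
        assert(root.subdirs + 2 == count_nlink_slow(&root));
        printf("nlink = %lu\n", root.subdirs + 2);
        return 0;
    }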
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 6 ++++++ fs/sysfs/inode.c | 14 +------------- fs/sysfs/sysfs.h | 2 ++ 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a830d8..7d240e6b7176 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -47,6 +47,9 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) BUG_ON(sd->s_sibling); + if (sysfs_type(sd) == SYSFS_DIR) + parent_sd->s_dir.subdirs++; + /* Store directory entries in order by ino. This allows * readdir to properly restart without having to add a * cursor into the s_dir.children list. @@ -73,6 +76,9 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent **pos; + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; + for (pos = &sd->s_parent->s_dir.children; *pos; pos = &(*pos)->s_sibling) { if (*pos == sd) { diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a81c72..1ee18c81df78 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -static int sysfs_count_nlink(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *child; - int nr = 0; - - for (child = sd->s_dir.children; child; child = child->s_sibling) - if (sysfs_type(child) == SYSFS_DIR) - nr++; - - return nr + 2; -} - static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) { struct sysfs_inode_attrs *iattrs = sd->s_iattr; @@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sysfs_count_nlink(sd); + inode->i_nlink = sd->s_dir.subdirs + 2; } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 845ab3ad229d..6348e2c753f6 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -19,6 +19,8 @@ struct sysfs_elem_dir { struct kobject *kobj; /* children list starts here and goes through sd->s_sibling */ struct sysfs_dirent *children; + + unsigned long subdirs; }; struct sysfs_elem_symlink { -- cgit v1.2.3-58-ga151 From 4f72c0cab40536a0be501d85ea4918467ab82ad5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 25 Jul 2011 17:55:57 -0400 Subject: sysfs: use rb-tree for name lookups sysfs: use rb-tree for name lookups Use red-black tree for name lookups. 
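The kernel side keys the rb-tree on strcmp() of s_name. As a userspace analog of the same O(log n) name lookup, the POSIX tsearch()/tfind() pair (typically implemented as a balanced binary tree) can stand in; this sketch is illustrative only, not how sysfs itself is built:

    #include <search.h>
    #include <stdio.h>
    #include <string.h>

    /* Compare by name, as sysfs_find_dirent() does with strcmp(). */
    static int cmp(const void *a, const void *b)
    {
        return strcmp((const char *)a, (const char *)b);
    }

    int main(void)
    {
        void *root = NULL;   /* tree root, playing the role of name_tree */
        const char *names[] = { "sda", "sdb", "loop0", "dm-0" };

        for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
            tsearch(names[i], &root, cmp);        /* O(log n) insert */

        void **hit = tfind("loop0", &root, cmp);  /* O(log n) lookup */
        printf("found: %s\n", hit ? *(const char **)hit : "(none)");
        return 0;
    }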
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- fs/sysfs/sysfs.h | 5 +++++ 2 files changed, 55 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 7d240e6b7176..3e937da224d4 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -45,6 +45,9 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) struct sysfs_dirent *parent_sd = sd->s_parent; struct sysfs_dirent **pos; + struct rb_node **p; + struct rb_node *parent; + BUG_ON(sd->s_sibling); if (sysfs_type(sd) == SYSFS_DIR) @@ -60,6 +63,23 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) } sd->s_sibling = *pos; *pos = sd; + + p = &parent_sd->s_dir.name_tree.rb_node; + parent = NULL; + while (*p) { + int c; + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, name_node) + c = strcmp(sd->s_name, node->s_name); + if (c < 0) { + p = &node->name_node.rb_left; + } else { + p = &node->name_node.rb_right; + } +#undef node + } + rb_link_node(&sd->name_node, parent, p); + rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); } /** @@ -87,6 +107,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) break; } } + + rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } /** @@ -546,15 +568,36 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns, const unsigned char *name) { - struct sysfs_dirent *sd; + struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; + struct sysfs_dirent *found = NULL; + + while (p) { + int c; +#define node rb_entry(p, struct sysfs_dirent, name_node) + c = strcmp(name, node->s_name); + if (c < 0) { + p = node->name_node.rb_left; + } else if (c > 0) { + p = node->name_node.rb_right; + } else { + found = node; + p = node->name_node.rb_left; + } +#undef node + } - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) - continue; - if (!strcmp(sd->s_name, name)) - return sd; + if (found && ns) { + while (found->s_ns && found->s_ns != ns) { + p = rb_next(&found->name_node); + if (!p) + return NULL; + found = rb_entry(p, struct sysfs_dirent, name_node); + if (strcmp(name, found->s_name)) + return NULL; + } } - return NULL; + + return found; } /** diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 6348e2c753f6..fe1a9e8650bf 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ #include #include #include +#include struct sysfs_open_dirent; @@ -21,6 +22,8 @@ struct sysfs_elem_dir { struct sysfs_dirent *children; unsigned long subdirs; + + struct rb_root name_tree; }; struct sysfs_elem_symlink { @@ -61,6 +64,8 @@ struct sysfs_dirent { struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node name_node; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; -- cgit v1.2.3-58-ga151 From 58f2a4c7932d8bec866d0394f806004146cde827 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 21 Jul 2011 20:01:12 -0400 Subject: sysfs: remove s_sibling hacks sysfs: remove s_sibling hacks s_sibling was used for three different purposes: 1) as a linked list of entries in the directory 2) as a linked list of entries to be deleted 3) as a pointer to "struct completion" This patch removes the hack and introduces a new union u which holds pointers for cases 2) and 3). This change is needed for the following patch that removes s_sibling altogether and replaces it with an rb tree.
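For readers skimming the diff, the pattern introduced here is an ordinary C union discriminated by lifecycle phase, replacing casts through an unrelated pointer field. A tiny standalone sketch of the same idea (toy types, not the sysfs structures):

    #include <stdio.h>

    struct completion_toy { int done; };

    /* One storage slot reused for two mutually exclusive purposes, made
     * explicit with a union instead of casting through a sibling pointer. */
    struct entry {
        unsigned int flags;             /* phase discriminates the union */
        union {
            struct completion_toy *completion;  /* while deactivating */
            struct entry *removed_list;         /* while queued for removal */
        } u;
    };

    #define FLAG_REMOVED 0x1

    int main(void)
    {
        struct entry a = { 0 }, b = { 0 };
        struct completion_toy wait = { 0 };

        /* Phase 1: the entry is chained on a removal list via u.removed_list. */
        a.flags |= FLAG_REMOVED;
        a.u.removed_list = &b;

        /* Phase 2: once off the list, the same slot holds the completion
         * pointer that the final put will signal. */
        a.u.completion = &wait;
        a.u.completion->done = 1;

        printf("completed: %d\n", wait.done);
        return 0;
    }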
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 19 +++++++------------ fs/sysfs/sysfs.h | 5 +++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 3e937da224d4..a4846979d4e8 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -154,7 +154,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) */ void sysfs_put_active(struct sysfs_dirent *sd) { - struct completion *cmpl; int v; if (unlikely(!sd)) @@ -166,10 +165,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) return; /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->s_sibling. + * sd->u.completion. */ - cmpl = (void *)sd->s_sibling; - complete(cmpl); + complete(sd->u.completion); } /** @@ -183,16 +181,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) return; - sd->s_sibling = (void *)&wait; + sd->u.completion = (void *)&wait; rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->s_sibling. + * the updated sd->u.completion. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); @@ -201,8 +199,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) wait_for_completion(&wait); } - sd->s_sibling = NULL; - lock_acquired(&sd->dep_map, _RET_IP_); rwsem_release(&sd->dep_map, 1, _RET_IP_); } @@ -518,7 +514,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->s_sibling = acxt->removed; + sd->u.removed_list = acxt->removed; acxt->removed = sd; } @@ -542,8 +538,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) while (acxt->removed) { struct sysfs_dirent *sd = acxt->removed; - acxt->removed = sd->s_sibling; - sd->s_sibling = NULL; + acxt->removed = sd->u.removed_list; sysfs_deactivate(sd); unmap_bin_file(sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index fe1a9e8650bf..3c261681713b 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -66,6 +66,11 @@ struct sysfs_dirent { struct rb_node name_node; + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; -- cgit v1.2.3-58-ga151 From a406f75840e15afbabd98cb64ae36b51424a8033 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 25 Jul 2011 17:57:03 -0400 Subject: sysfs: use rb-tree for inode number lookup sysfs: use rb-tree for inode number lookup This patch makes sysfs use a red-black tree for inode number lookup. Together with the previous patch that uses a red-black tree for name lookup, this patch gives all sysfs lookups O(log n) complexity.
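The inode-number tree added below is the same rbtree pattern keyed by a unique integer. A simplified sketch (illustrative names, not the sysfs code) of a duplicate-rejecting insert plus the rb_first()/rb_next() walk that stands in for the old sorted s_sibling iteration:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/rbtree.h>

struct ino_ent {
        struct rb_node node;
        unsigned long ino;
};

static int ino_tree_insert(struct rb_root *root, struct ino_ent *new)
{
        struct rb_node **p = &root->rb_node, *parent = NULL;

        while (*p) {
                struct ino_ent *ent = rb_entry(*p, struct ino_ent, node);

                parent = *p;
                if (new->ino < ent->ino)
                        p = &(*p)->rb_left;
                else if (new->ino > ent->ino)
                        p = &(*p)->rb_right;
                else
                        return -EEXIST; /* inode numbers are unique */
        }
        rb_link_node(&new->node, parent, p);
        rb_insert_color(&new->node, root);
        return 0;
}

/* In-order traversal visits entries in ascending inode order --
 * the property readdir relied on with the old sorted list. */
static void ino_tree_walk(struct rb_root *root)
{
        struct rb_node *p;

        for (p = rb_first(root); p; p = rb_next(p)) {
                struct ino_ent *ent = rb_entry(p, struct ino_ent, node);

                pr_debug("ino %lu\n", ent->ino);
        }
}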
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 89 +++++++++++++++++++++++++++++++------------------------- fs/sysfs/sysfs.h | 5 ++-- 2 files changed, 52 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index a4846979d4e8..c3646d93a032 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,26 +43,30 @@ static DEFINE_IDA(sysfs_ino_ida); static void sysfs_link_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent **pos; struct rb_node **p; struct rb_node *parent; - BUG_ON(sd->s_sibling); - if (sysfs_type(sd) == SYSFS_DIR) parent_sd->s_dir.subdirs++; - /* Store directory entries in order by ino. This allows - * readdir to properly restart without having to add a - * cursor into the s_dir.children list. - */ - for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { - if (sd->s_ino < (*pos)->s_ino) - break; + p = &parent_sd->s_dir.inode_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, inode_node) + if (sd->s_ino < node->s_ino) { + p = &node->inode_node.rb_left; + } else if (sd->s_ino > node->s_ino) { + p = &node->inode_node.rb_right; + } else { + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", sd->s_ino); + BUG(); + } +#undef node } - sd->s_sibling = *pos; - *pos = sd; + rb_link_node(&sd->inode_node, parent, p); + rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); p = &parent_sd->s_dir.name_tree.rb_node; parent = NULL; @@ -94,20 +98,10 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { - struct sysfs_dirent **pos; - if (sysfs_type(sd) == SYSFS_DIR) sd->s_parent->s_dir.subdirs--; - for (pos = &sd->s_parent->s_dir.children; *pos; - pos = &(*pos)->s_sibling) { - if (*pos == sd) { - *pos = sd->s_sibling; - sd->s_sibling = NULL; - break; - } - } - + rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } @@ -788,21 +782,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent **pos; + struct rb_node *pos; if (!dir_sd) return; pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); sysfs_addrm_start(&acxt, dir_sd); - pos = &dir_sd->s_dir.children; - while (*pos) { - struct sysfs_dirent *sd = *pos; - + pos = rb_first(&dir_sd->s_dir.inode_tree); + while (pos) { + struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); + pos = rb_next(pos); if (sysfs_type(sd) != SYSFS_DIR) sysfs_remove_one(&acxt, sd); - else - pos = &(*pos)->s_sibling; } sysfs_addrm_finish(&acxt); @@ -925,12 +917,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, pos = NULL; } if (!pos && (ino > 1) && (ino < INT_MAX)) { - pos = parent_sd->s_dir.children; - while (pos && (ino > pos->s_ino)) - pos = pos->s_sibling; + struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; + while (p) { +#define node rb_entry(p, struct sysfs_dirent, inode_node) + if (ino < node->s_ino) { + pos = node; + p = node->inode_node.rb_left; + } else if (ino > node->s_ino) { + p = node->inode_node.rb_right; + } else { + pos = node; + break; + } +#undef node + } + } + while (pos && pos->s_ns && pos->s_ns != ns) { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); } - while 
(pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; return pos; } @@ -938,10 +946,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; + if (pos) do { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); + } while (pos && pos->s_ns && pos->s_ns != ns); return pos; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3c261681713b..ce29e28b766d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -18,11 +18,10 @@ struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ struct sysfs_elem_dir { struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; unsigned long subdirs; + struct rb_root inode_tree; struct rb_root name_tree; }; @@ -61,9 +60,9 @@ struct sysfs_dirent { struct lockdep_map dep_map; #endif struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node inode_node; struct rb_node name_node; union { -- cgit v1.2.3-58-ga151 From d27769ec3df1a8de9ca450d2dcd72d1ab259ba32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Aug 2011 20:01:04 +0200 Subject: block: add GENHD_FL_NO_PART_SCAN There are cases where suppressing partition scan is useful - e.g. for lo devices and pseudo SATA devices which advertise to be a disk but get upset on partition scan (some port multiplier control devices show such behavior). This patch adds GENHD_FL_NO_PART_SCAN which suppresses partition scan regardless of the number of possible partitions. disk_partitionable() is renamed to disk_part_scan_enabled() as suppressing partition scan doesn't imply the device can't be partitioned using BLKPG_ADD/DEL_PARTITION calls from userland. show_partition() now directly tests disk_max_parts() to maintain backward-compatibility. -v2: Updated to make it clear that only partition scan is suppressed not partitioning itself as suggested by Kay Sievers. 
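As a usage sketch (the driver context here is hypothetical, not part of this patch): a driver that knows its device carries no partition table just sets the flag before add_disk(), and the rest of the block layer is untouched:

#include <linux/genhd.h>

/* Hypothetical driver setup: register a whole disk but suppress the
 * partition-table scan. Userspace can still partition the device
 * via the BLKPG_ADD/DEL_PARTITION ioctls. */
static void mydrv_register_disk(struct gendisk *disk)
{
        disk->flags |= GENHD_FL_NO_PART_SCAN;
        add_disk(disk);
}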
Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- block/ioctl.c | 2 +- fs/block_dev.c | 2 +- include/linux/genhd.h | 6 ++++-- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/block/genhd.c b/block/genhd.c index 5cb51c55f6d8..2429ecbbd97d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -536,7 +536,7 @@ void register_disk(struct gendisk *disk) disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ - if (!disk_partitionable(disk)) + if (!disk_part_scan_enabled(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -841,7 +841,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/block/ioctl.c b/block/ioctl.c index 1124cd297263..5c74efc01903 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -101,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev) struct gendisk *disk = bdev->bd_disk; int res; - if (!disk_partitionable(disk) || bdev != bdev->bd_contains) + if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/fs/block_dev.c b/fs/block_dev.c index ff77262e887c..0bed0d4588dd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 02fa4697a0e5..6d18f3531f18 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -128,6 +128,7 @@ struct hd_struct { #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 +#define GENHD_FL_NO_PART_SCAN 512 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ @@ -234,9 +235,10 @@ static inline int disk_max_parts(struct gendisk *disk) return disk->minors; } -static inline bool disk_partitionable(struct gendisk *disk) +static inline bool disk_part_scan_enabled(struct gendisk *disk) { - return disk_max_parts(disk) > 1; + return disk_max_parts(disk) > 1 && + !(disk->flags & GENHD_FL_NO_PART_SCAN); } static inline dev_t disk_devt(struct gendisk *disk) -- cgit v1.2.3-58-ga151 From 7606f85a701ed8feeac065e133ff9a51c267aa0d Mon Sep 17 00:00:00 2001 From: srimugunthan dhandapani Date: Fri, 26 Aug 2011 16:08:39 +0530 Subject: UBIFS: fix the dark space calculation The dark space calculation should be type-cast to 64 bits when assigning to tmp64 (similar to how total_free is calculated). Otherwise the 32-bit multiplication will overflow for very large flashes. Signed-off-by: srimugunthan Signed-off-by: Artem Bityutskiy --- fs/ubifs/recovery.c | 2 +- fs/ubifs/sb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790d9328..ee4f43f4bb99 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * clean_an_unclean_leb - read and write a LEB to remove corruption. * @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938ad3d2a..6094c5a5d7a8 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); -- cgit v1.2.3-58-ga151 From 3d2544b1e4909b6dffa0d140273628913e255e45 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 11:49:30 -0400 Subject: nfsd4: clean up S_IS -> NF4 file type mapping A slightly unconventional approach to make the code more compact I could live with, but let's give the poor reader *some* chance. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f81099605256..51ec1f274501 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1772,12 +1772,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, return 0; } -static u32 nfs4_ftypes[16] = { - NF4BAD, NF4FIFO, NF4CHR, NF4BAD, - NF4DIR, NF4BAD, NF4BLK, NF4BAD, - NF4REG, NF4BAD, NF4LNK, NF4BAD, - NF4SOCK, NF4BAD, NF4LNK, NF4BAD, -}; +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} static __be32 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, @@ -1966,7 +1973,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) goto out_resource; - dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) goto out_serverfault; WRITE32(dummy); -- cgit v1.2.3-58-ga151 From 7d818a7b8fc8d26c24ee44ed1c5dece69455a7b6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 16:59:55 -0400 Subject: nfsd: open-code special directory-hardlink check We allow the fh_verify caller to specify that any object *except* those of a given type is allowed, by passing a negative type. But only one caller actually uses it. Open-code that check in the one caller. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsfh.c | 9 --------- fs/nfsd/vfs.c | 6 ++++-- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 90c6aa6d5e0f..8cd49b9bf085 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -61,7 +61,6 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) static inline __be32 nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) { - /* Type can be negative when creating hardlinks - not to a dir */ if (type > 0 && (mode & S_IFMT) != type) { if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) return nfserr_symlink; @@ -72,14 +71,6 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) else return nfserr_inval; } - if (type < 0 && (mode & S_IFMT) == -type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == -S_IFDIR) - return nfserr_isdir; - else - return nfserr_notdir; - } return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd0acca5370a..0c92a17d1770 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1632,10 +1632,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); if (err) goto out; - + err = nfserr_isdir; + if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + goto out; err = nfserr_perm; if (!len) goto out; -- cgit v1.2.3-58-ga151 From e10f9e1413576f58c18b89f6ae212a37a88b24e7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 17:04:19 -0400 Subject: nfsd: clean up nfsd_mode_check() Add some more comments, simplify logic, do & S_IFMT just once, name "type" more helpfully. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsfh.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8cd49b9bf085..c763de5c1157 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,19 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested) { - if (type > 0 && (mode & S_IFMT) != type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == S_IFDIR) - return nfserr_notdir; - else if ((mode & S_IFMT) == S_IFDIR) - return nfserr_isdir; - else - return nfserr_inval; - } - return 0; + mode &= S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) + return nfs_ok; + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; } static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, -- cgit v1.2.3-58-ga151 From e281d8100995133dc65e00b1dec8f84b91b6e8c3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 16:57:07 -0400 Subject: nfsd4: fix incorrect comment in nfsd4_set_nfs4_acl Zero means "I don't care what kind of file this is". 
And that's probably what we want--acls are also settable at least on directories, and if the filesystem doesn't want them on other objects, leave it to it to complain. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0c92a17d1770..4c22870293e6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -502,7 +502,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); if (error) return error; -- cgit v1.2.3-58-ga151 From 75c096f753b273b59f1b9a0745e9e4b5d911a312 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 18:39:32 -0400 Subject: nfsd4: it's OK to return nfserr_symlink The nfsd4 code has a bunch of special exceptions for error returns which map nfserr_symlink to other errors. In fact, the spec makes it clear that nfserr_symlink is to be preferred over less specific errors where possible. The patch that introduced it back in 2.6.4 is "kNFSd: correct symlink related error returns.", which claims that these special exceptions represent an NFSv4 break from v2/v3 tradition--when in fact the symlink error was introduced with v4. I suspect what happened was pynfs tests were written that were overly faithful to the (known-incomplete) rfc3530 error return lists, and then code was fixed up mindlessly to make the tests pass. Delete these unnecessary exceptions. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +-------------- fs/nfsd/nfs4state.c | 6 +----- fs/nfsd/nfs4xdr.c | 4 ---- 3 files changed, 2 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d784ceb81a62..479ffb185df9 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -488,17 +488,12 @@ static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - __be32 status; - u32 *p = (u32 *)commit->co_verf.data; *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); - if (status == nfserr_symlink) - status = nfserr_inval; - return status; } static __be32 @@ -513,8 +508,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_CREATE); - if (status == nfserr_symlink) - status = nfserr_notdir; if (status) return status; @@ -740,8 +733,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); - if (status == nfserr_symlink) - return nfserr_notdir; if (!status) { fh_unlock(&cstate->current_fh); set_change_info(&remove->rm_cinfo, &cstate->current_fh); @@ -772,8 +763,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) status = nfserr_exist; - else if (status == nfserr_symlink) - status = nfserr_notdir; if (!status) { set_change_info(&rename->rn_sinfo, &cstate->current_fh); @@ -913,8 +902,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_bytes_written = cnt; - if (status ==
nfserr_symlink) - status = nfserr_inval; return status; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3787ec117400..aa0a36e3b09e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4179,12 +4179,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { - dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); - if (status == nfserr_symlink) - status = nfserr_inval; + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - } inode = cstate->current_fh.fh_dentry->d_inode; locks_init_lock(&file_lock); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51ec1f274501..78c792fb59a8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2778,8 +2778,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); - if (nfserr == nfserr_symlink) - nfserr = nfserr_inval; if (nfserr) return nfserr; eof = (read->rd_offset + maxcount >= @@ -2905,8 +2903,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 readdir->common.err == nfserr_toosmall && readdir->buffer == page) nfserr = nfserr_toosmall; - if (nfserr == nfserr_symlink) - nfserr = nfserr_notdir; if (nfserr) goto err_no_verf; -- cgit v1.2.3-58-ga151 From c10bd39d800d42adef55ed9016f802677cd0ab5f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 19 Aug 2011 11:38:52 -0400 Subject: Remove include/linux/nfsd/const.h Userspace shouldn't have a use for these constants. Nothing here is used outside fs/nfsd. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsd.h | 26 ++++++++++++++++++++++++++ include/linux/nfsd/Kbuild | 1 - include/linux/nfsd/const.h | 42 ------------------------------------------ include/linux/nfsd/nfsfh.h | 7 +++++-- 4 files changed, 31 insertions(+), 45 deletions(-) delete mode 100644 include/linux/nfsd/const.h (limited to 'fs') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7ecfa2420307..8da03e16ab35 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -11,13 +11,39 @@ #include #include +#include +#include +#include +#include +#include + #include #include #include + /* * nfsd version */ #define NFSD_SUPPORTED_MINOR_VERSION 1 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. 
+ */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild index 55d1467de3c1..0e528606d46f 100644 --- a/include/linux/nfsd/Kbuild +++ b/include/linux/nfsd/Kbuild @@ -1,4 +1,3 @@ -header-y += const.h header-y += debug.h header-y += export.h header-y += nfsfh.h diff --git a/include/linux/nfsd/const.h b/include/linux/nfsd/const.h deleted file mode 100644 index feb3764617a4..000000000000 --- a/include/linux/nfsd/const.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * include/linux/nfsd/const.h - * - * Various constants related to NFS. - * - * Copyright (C) 1995-1997 Olaf Kirch - */ - -#ifndef _LINUX_NFSD_CONST_H -#define _LINUX_NFSD_CONST_H - -#include -#include -#include -#include - -/* - * Maximum blocksizes supported by daemon under various circumstances. - */ -#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD -/* NFSv2 is limited by the protocol specification, see RFC 1094 */ -#define NFSSVC_MAXBLKSIZE_V2 (8*1024) - -#ifdef __KERNEL__ - -#include - -/* - * Largest number of bytes we need to allocate for an NFS - * call or reply. Used to control buffer sizes. We use - * the length of v3 WRITE, READDIR and READDIR replies - * which are an RPC header, up to 26 XDR units of reply - * data, and some page data. - * - * Note that accuracy here doesn't matter too much as the - * size is rounded up to a page size when allocating space. - */ -#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_NFSD_CONST_H */ diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index f76d80ccec10..ce4743a26015 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -14,11 +14,14 @@ #ifndef _LINUX_NFSD_FH_H #define _LINUX_NFSD_FH_H -# include +#include +#include +#include +#include +#include #ifdef __KERNEL__ # include #endif -#include /* * This is the old "dentry style" Linux NFSv2 file handle. -- cgit v1.2.3-58-ga151 From a043226bc140a2c1dde162246d68a67e5043e6b2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 25 Aug 2011 10:48:39 -0400 Subject: nfsd4: permit read opens of executable-only files A client that wants to execute a file must be able to read it. Read opens over nfs are therefore implicitly allowed for executable files even when those files are not readable. NFSv2/v3 get this right by using a passed-in NFSD_MAY_OWNER_OVERRIDE on read requests, but NFSv4 has gotten this wrong ever since dc730e173785e29b297aa605786c94adaffe2544 "nfsd4: fix owner-override on open", when we realized that the file owner shouldn't override permissions on non-reclaim NFSv4 opens. So we can't use NFSD_MAY_OWNER_OVERRIDE to tell nfsd_permission to allow reads of executable files. So, do the same thing we do whenever we encounter another weird NFS permission nit: define yet another NFSD_MAY_* flag. The industry's future standardization on 128-bit processors will be motivated primarily by the need for integers with enough bits for all the NFSD_MAY_* flags. Reported-by: Leonardo Borda Cc: stable@kernel.org Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 ++ fs/nfsd/vfs.c | 3 ++- fs/nfsd/vfs.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 479ffb185df9..b5530984db91 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -156,6 +156,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; + accmode |= NFSD_MAY_READ_IF_EXEC; + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4c22870293e6..75c35fa46155 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2116,7 +2116,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0bbac04d1dd..a22e40e27861 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -25,6 +25,7 @@ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 #define NFSD_MAY_NOT_BREAK_LEASE 512 #define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_READ_IF_EXEC 2048 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) -- cgit v1.2.3-58-ga151 From 8e82fa8fdcd1271d45bf6a5195801c706e190d69 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 25 Aug 2011 14:23:39 -0400 Subject: nfsd: prettify NFSD_MAY_* flag definitions Acked-by: Jim Rees Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a22e40e27861..503f3bf11abd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -10,22 +10,22 @@ /* * Flags for nfsd_permission */ -#define NFSD_MAY_NOP 0 -#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ -#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ -#define NFSD_MAY_READ 4 /* == MAY_READ */ -#define NFSD_MAY_SATTR 8 -#define NFSD_MAY_TRUNC 16 -#define NFSD_MAY_LOCK 32 -#define NFSD_MAY_MASK 63 +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ -#define NFSD_MAY_OWNER_OVERRIDE 64 -#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 -#define NFSD_MAY_NOT_BREAK_LEASE 512 -#define NFSD_MAY_BYPASS_GSS 1024 -#define NFSD_MAY_READ_IF_EXEC 2048 +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) -- cgit v1.2.3-58-ga151 From 48483bf23a568f3ef4cc7ad2c8f1a082f10ad0e7 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 26 Aug 2011 20:40:28 -0400 Subject: nfsd4: simplify recovery dir setting Move around some of this code, simplify a bit. Reviewed-by: Boaz Harrosh Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 36 ++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4state.c | 41 +---------------------------------------- fs/nfsd/state.h | 2 +- 3 files changed, 34 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 29d77f60585b..c3466610e6cd 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -45,6 +45,7 @@ /* Globals */ static struct file *rec_file; +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) @@ -354,13 +355,13 @@ nfsd4_recdir_load(void) { */ void -nfsd4_init_recdir(char *rec_dirname) +nfsd4_init_recdir() { const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", - rec_dirname); + user_recovery_dirname); BUG_ON(rec_file); @@ -372,10 +373,10 @@ nfsd4_init_recdir(char *rec_dirname) return; } - rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", - rec_dirname); + user_recovery_dirname); rec_file = NULL; } @@ -390,3 +391,30 @@ nfsd4_shutdown_recdir(void) fput(rec_file); rec_file = NULL; } + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aa0a36e3b09e..36d0beb76864 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -64,8 +64,6 @@ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_stateid * search_for_stateid(stateid_t *stid); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static void nfs4_set_recdir(char *recdir); static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); /* Locking: */ @@ -4523,7 +4521,7 @@ nfsd4_load_reboot_recovery_data(void) int status; nfs4_lock_state(); - nfsd4_init_recdir(user_recovery_dirname); + nfsd4_init_recdir(); status = nfsd4_recdir_load(); nfs4_unlock_state(); if (status) @@ -4632,40 +4630,3 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } - -/* - * user_recovery_dirname is protected by the nfsd_mutex since it's only - * accessed when nfsd is starting. - */ -static void -nfs4_set_recdir(char *recdir) -{ - strcpy(user_recovery_dirname, recdir); -} - -/* - * Change the NFSv4 recovery directory to recdir. 
- */ -int -nfs4_reset_recoverydir(char *recdir) -{ - int status; - struct path path; - - status = kern_path(recdir, LOOKUP_FOLLOW, &path); - if (status) - return status; - status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { - nfs4_set_recdir(recdir); - status = 0; - } - path_put(&path); - return status; -} - -char * -nfs4_recoverydir(void) -{ - return user_recovery_dirname; -} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 5cfebe504056..12c1b1ef52ec 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -467,7 +467,7 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void nfsd4_init_recdir(char *recdir_name); +extern void nfsd4_init_recdir(void); extern int nfsd4_recdir_load(void); extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); -- cgit v1.2.3-58-ga151 From 6577aac01f00636c16cd583c30bd4dedf18475d5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 12 Aug 2011 17:30:12 -0700 Subject: nfsd4: fix failure to end nfsd4 grace period Even if we fail to write a recovery record, we should still mark the client as having acquired its first state. Otherwise we leave 4.1 clients with indefinite ERR_GRACE returns. However, an inability to write stable storage records may cause failures of reboot recovery, and the problem should still be brought to the server administrator's attention. So, make sure the error is logged. These errors shouldn't normally be triggered on a correctly functioning server--this isn't a case where a misconfigured client could spam the logs. Signed-off-by: Boaz Harrosh Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c3466610e6cd..493851b844fe 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -130,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_file || clp->cl_firststate) return 0; + clp->cl_firststate = 1; status = nfs4_save_creds(&original_cred); if (status < 0) return status; @@ -144,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_unlock; } status = -EEXIST; - if (dentry->d_inode) { - dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + if (dentry->d_inode) goto out_put; - } status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; @@ -157,12 +156,14 @@ out_put: dput(dentry); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); - if (status == 0) { - clp->cl_firststate = 1; + if (status == 0) vfs_fsync(rec_file, 0); - } + else + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); nfs4_reset_creds(original_cred); - dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } -- cgit v1.2.3-58-ga151 From 3e77246393c0a433247631a1f0e9ec98d3d78a1c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 10 Aug 2011 19:07:33 -0400 Subject: nfsd4: stop using nfserr_resource for transitory errors The server is returning nfserr_resource for both permanent errors and for errors (like allocation failures) that might be resolved by retrying later. Save nfserr_resource for the former and use delay/jukebox for the latter. Cc: stable@kernel.org Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfs4state.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b5530984db91..50bae7471147 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -940,7 +940,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, count = 4 + (verify->ve_attrlen >> 2); buf = kmalloc(count << 2, GFP_KERNEL); if (!buf) - return nfserr_resource; + return nfserr_jukebox; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 493851b844fe..ed083b9a731b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -89,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) struct xdr_netobj cksum; struct hash_desc desc; struct scatterlist sg; - __be32 status = nfserr_resource; + __be32 status = nfserr_jukebox; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 36d0beb76864..c7d54f6a19c9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1944,7 +1944,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * of 5 bullet points, labeled as CASE0 - CASE4 below. */ unconf = find_unconfirmed_client_by_str(dname, strhashval); - status = nfserr_resource; + status = nfserr_jukebox; if (!conf) { /* * RFC 3530 14.2.33 CASE 4: @@ -2481,7 +2481,7 @@ renew: if (open->op_stateowner == NULL) { sop = alloc_init_open_stateowner(strhashval, clp, open); if (sop == NULL) - return nfserr_resource; + return nfserr_jukebox; open->op_stateowner = sop; } list_del_init(&sop->so_close_lru); @@ -2617,7 +2617,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, stp = nfs4_alloc_stateid(); if (stp == NULL) - return nfserr_resource; + return nfserr_jukebox; status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) { @@ -2848,7 +2848,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfserr_bad_stateid; if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) goto out; - status = nfserr_resource; + status = nfserr_jukebox; fp = alloc_init_file(ino); if (fp == NULL) goto out; @@ -4033,7 +4033,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? */ - status = nfserr_resource; + status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, open_sop->so_client, open_stp, lock); if (lock_sop == NULL) @@ -4117,9 +4117,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case (EDEADLK): status = nfserr_deadlock; break; - default: + default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); - status = nfserr_resource; + status = nfserrno(err); break; } out: -- cgit v1.2.3-58-ga151 From ddc04c41636f8cd374d72cdd3a0fcaa916fbc5d0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 30 Jul 2011 23:46:29 -0400 Subject: nfsd4: replace some macros by functions For all the usual reasons. (Type safety, readability.) Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c7d54f6a19c9..ce8e70cd04c5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -139,10 +139,19 @@ unsigned int max_delegations; #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) +static unsigned int ownerid_hashval(const u32 id) +{ + return id & OWNER_HASH_MASK; +} + +static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +{ + unsigned int ret; + + ret = opaque_hashval(ownername->data, ownername->len); + ret += clientid; + return ret & OWNER_HASH_MASK; +} static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; @@ -156,10 +165,16 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; #define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) #define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) +static unsigned int file_hashval(struct inode *ino) +{ + /* XXX: why are we hashing on inode pointer, anyway? */ + return hash_ptr(ino, FILE_HASH_BITS); +} + +static unsigned int stateid_hashval(u32 owner_id, u32 file_id) +{ + return (owner_id + file_id) & STATEID_HASH_MASK; +} static struct list_head file_hashtbl[FILE_HASH_SIZE]; static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; @@ -290,10 +305,16 @@ static DEFINE_SPINLOCK(client_lock); #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define clientid_hashval(id) \ - ((id) & CLIENT_HASH_MASK) -#define clientstr_hashval(name) \ - (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(const char *name) +{ + return opaque_hashval(name, 8) & CLIENT_HASH_MASK; +} + /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing @@ -2443,7 +2464,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; - strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); + strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); sop = find_openstateowner_str(strhashval, open); open->op_stateowner = sop; if (!sop) { @@ -3711,8 +3732,10 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -#define lockownerid_hashval(id) \ - ((id) & LOCK_HASH_MASK) +static unsigned int lockownerid_hashval(u32 id) +{ + return id & LOCK_HASH_MASK; +} static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, -- cgit v1.2.3-58-ga151 From 506f275fffcaceb8abe3db69e581d86c7da48a82 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 11 Aug 2011 18:50:18 -0400 Subject: nfsd4: name openowner data structures more clearly These appear to be generic (for both open and lock owners), but they're actually just for open owners. This has confused me more than once. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ce8e70cd04c5..dd6424face32 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -134,27 +134,27 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) +/* hash tables for open owners */ +#define OPEN_OWNER_HASH_BITS 8 +#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) +#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -static unsigned int ownerid_hashval(const u32 id) +static unsigned int open_ownerid_hashval(const u32 id) { - return id & OWNER_HASH_MASK; + return id & OPEN_OWNER_HASH_MASK; } -static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); ret += clientid; - return ret & OWNER_HASH_MASK; + return ret & OPEN_OWNER_HASH_MASK; } -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; +static struct list_head open_ownerid_hashtbl[OPEN_OWNER_HASH_SIZE]; +static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 @@ -2240,7 +2240,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str if (!(sop = alloc_stateowner(&open->op_owner))) return NULL; - idhashval = ownerid_hashval(current_ownerid); + idhashval = open_ownerid_hashval(current_ownerid); INIT_LIST_HEAD(&sop->so_idhash); INIT_LIST_HEAD(&sop->so_strhash); INIT_LIST_HEAD(&sop->so_perclient); @@ -2248,8 +2248,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ INIT_LIST_HEAD(&sop->so_close_lru); sop->so_time = 0; - list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&sop->so_idhash, &open_ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &open_ownerstr_hashtbl[strhashval]); list_add(&sop->so_perclient, &clp->cl_openowners); sop->so_is_open_owner = 1; sop->so_id = current_ownerid++; @@ -2313,7 +2313,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { struct nfs4_stateowner *so = NULL; - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(so, &open->op_owner, &open->op_clientid)) return so; } @@ -2464,7 +2464,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; - strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); + strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); sop = find_openstateowner_str(strhashval, open); open->op_stateowner = sop; if (!sop) { @@ -4518,9 +4518,9 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&ownerid_hashtbl[i]); + for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); + 
INIT_LIST_HEAD(&open_ownerid_hashtbl[i]); } for (i = 0; i < STATEID_HASH_SIZE; i++) { INIT_LIST_HEAD(&stateid_hashtbl[i]); -- cgit v1.2.3-58-ga151 From ff194bd95959ea9047d536b4f4ad6a992754e48d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 12 Aug 2011 09:42:57 -0400 Subject: nfsd4: cleanup lock/stateowner initialization Share some common code, stop doing silly things like initializing a list head immediately before adding it to a list, etc. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 100 +++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dd6424face32..3e64288399f7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2215,51 +2215,61 @@ nfs4_free_stateowner(struct kref *kref) kmem_cache_free(stateowner_slab, sop); } -static inline struct nfs4_stateowner * -alloc_stateowner(struct xdr_netobj *owner) +static void init_nfs4_replay(struct nfs4_replay *rp) { - struct nfs4_stateowner *sop; - - if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { - if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { - memcpy(sop->so_owner.data, owner->data, owner->len); - sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); - return sop; - } - kmem_cache_free(stateowner_slab, sop); - } - return NULL; + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; } -static struct nfs4_stateowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { +static inline struct nfs4_stateowner *alloc_stateowner(struct xdr_netobj *owner, struct nfs4_client *clp) +{ struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&open->op_owner))) + sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL); + if (!sop) + return NULL; + + sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(stateowner_slab, sop); return NULL; - idhashval = open_ownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); + } + sop->so_owner.len = owner->len; + + kref_init(&sop->so_ref); INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ + INIT_LIST_HEAD(&sop->so_perstateid); INIT_LIST_HEAD(&sop->so_close_lru); + sop->so_id = current_ownerid++; sop->so_time = 0; + sop->so_client = clp; + init_nfs4_replay(&sop->so_replay); + return sop; +} + +static void hash_openowner(struct nfs4_stateowner *sop, struct nfs4_client *clp, unsigned int strhashval) +{ + unsigned int idhashval; + + idhashval = open_ownerid_hashval(sop->so_id); list_add(&sop->so_idhash, &open_ownerid_hashtbl[idhashval]); list_add(&sop->so_strhash, &open_ownerstr_hashtbl[strhashval]); list_add(&sop->so_perclient, &clp->cl_openowners); +} + +static struct nfs4_stateowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_stateowner *sop; + + sop = alloc_stateowner(&open->op_owner, clp); + if (!sop) + return NULL; sop->so_is_open_owner = 1; - sop->so_id = current_ownerid++; - sop->so_client = clp; sop->so_seqid = open->op_seqid; sop->so_confirmed = 0; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + hash_openowner(sop, clp, strhashval); return sop; } @@ -3902,6 +3912,16 @@ 
find_lockstateowner_str(struct inode *inode, clientid_t *clid, return NULL; } +static void hash_lockowner(struct nfs4_stateowner *sop, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) +{ + unsigned int idhashval; + + idhashval = lockownerid_hashval(sop->so_id); + list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&sop->so_perstateid, &open_stp->st_lockowners); +} + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -3913,33 +3933,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, static struct nfs4_stateowner * alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + sop = alloc_stateowner(&lock->lk_new_owner, clp); + if (!sop) return NULL; - idhashval = lockownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ - sop->so_time = 0; - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); sop->so_is_open_owner = 0; - sop->so_id = current_ownerid++; - sop->so_client = clp; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ sop->so_seqid = lock->lk_new_lock_seqid + 1; sop->so_confirmed = 1; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + hash_lockowner(sop, strhashval, clp, open_stp); return sop; } -- cgit v1.2.3-58-ga151 From 28dde241cc65c9464b7627d9a9ed3a66e4df2586 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 10:07:12 -0400 Subject: nfsd4: remove HAS_SESSION This flag doesn't really buy us anything. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 30 ++++++++++-------------------- fs/nfsd/state.h | 3 +-- fs/nfsd/xdr4.h | 2 +- 3 files changed, 12 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e64288399f7..14c8dd64e136 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3168,13 +3168,13 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. 
*/ - if ((flags & HAS_SESSION) && in->si_generation == 0) + if (has_session && in->si_generation == 0) goto out; /* If the client sends us a stateid from the future, it's buggy: */ @@ -3206,7 +3206,7 @@ static int is_open_stateid(struct nfs4_stateid *stateid) return stateid->st_openstp == NULL; } -__be32 nfs4_validate_stateid(stateid_t *stateid, int flags) +__be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { struct nfs4_stateid *stp = NULL; __be32 status = nfserr_stale_stateid; @@ -3223,7 +3223,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, int flags) if (!stp->st_stateowner->so_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = check_stateid_generation(stateid, &stp->st_stateid, has_session); if (status) goto out; @@ -3251,9 +3251,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (grace_disallows_io(ino)) return nfserr_grace; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); @@ -3270,8 +3267,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(ino, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, - flags); + status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); if (status) goto out; status = nfs4_check_delegmode(dp, flags); @@ -3292,7 +3288,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (!stp->st_stateowner->so_confirmed) goto out; status = check_stateid_generation(stateid, &stp->st_stateid, - flags); + nfsd4_has_session(cstate)); if (status) goto out; status = nfs4_check_openmode(stp, flags); @@ -3421,9 +3417,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (STALE_STATEID(stateid)) return nfserr_stale_stateid; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - /* * We return BAD_STATEID if filehandle doesn't match stateid, * the confirmed flag is incorrecly set, or the generation @@ -3457,7 +3450,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (lock->lk_is_new) { if (!sop->so_is_open_owner) return nfserr_bad_stateid; - if (!(flags & HAS_SESSION) && + if (!nfsd4_has_session(cstate) && !same_clid(&clp->cl_clientid, lockclid)) return nfserr_bad_stateid; /* stp is the open stateid */ @@ -3482,7 +3475,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * For the moment, we ignore the possibility of * generation number wraparound. 
*/ - if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) + if (!nfsd4_has_session(cstate) && seqid != sop->so_seqid) goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { @@ -3495,7 +3488,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); if (status) return status; renew_client(sop->so_client); @@ -3679,14 +3672,11 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &dr->dr_stateid; struct inode *inode; __be32 status; - int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; inode = cstate->current_fh.fh_dentry->d_inode; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; nfs4_lock_state(); status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -3701,7 +3691,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); if (status) goto out; renew_client(dp->dl_client); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 12c1b1ef52ec..f02badd70cf2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -439,7 +439,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define HAS_SESSION 0x00000001 #define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 @@ -476,7 +475,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(stateid_t *, int); +extern __be32 nfs4_validate_stateid(stateid_t *, bool); static inline void nfs4_put_stateowner(struct nfs4_stateowner *so) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d2a8d04428c7..663193b21a24 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -351,7 +351,7 @@ struct nfsd4_saved_compoundargs { struct nfsd4_test_stateid { __be32 ts_num_ids; - __be32 ts_has_session; + bool ts_has_session; struct nfsd4_compoundargs *ts_saved_args; struct nfsd4_saved_compoundargs ts_savedp; }; -- cgit v1.2.3-58-ga151 From a9004abc34239705840eaf6fe3d6cc9cb7656cba Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 Aug 2011 15:43:04 -0400 Subject: nfsd4: cleanup and consolidate seqid_mutating_err Signed-off-by: J. Bruce Fields --- fs/nfs/nfs4_fs.h | 24 ------------------------ fs/nfsd/nfs4xdr.c | 14 +------------- include/linux/nfs4.h | 16 ++++++++++++++++ 3 files changed, 17 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1ec1a85fa71c..1a652a0bd7db 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,30 +13,6 @@ struct idmap; -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations. This is true even in the event that the previous - * operation that used the sequence number received an error. 
The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. - * - */ -#define seqid_mutating_err(err) \ -(((err) != NFSERR_STALE_CLIENTID) && \ - ((err) != NFSERR_STALE_STATEID) && \ - ((err) != NFSERR_BAD_STATEID) && \ - ((err) != NFSERR_BAD_SEQID) && \ - ((err) != NFSERR_BAD_XDR) && \ - ((err) != NFSERR_RESOURCE) && \ - ((err) != NFSERR_NOFILEHANDLE)) - enum nfs4_client_state { NFS4CLNT_MANAGER_RUNNING = 0, NFS4CLNT_CHECK_LEASE, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 78c792fb59a8..04ad9a2ca3d0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1623,18 +1623,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) \ save = resp->p; -static bool seqid_mutating_err(__be32 err) -{ - /* rfc 3530 section 8.1.5: */ - return err != nfserr_stale_clientid && - err != nfserr_stale_stateid && - err != nfserr_bad_stateid && - err != nfserr_bad_seqid && - err != nfserr_bad_xdr && - err != nfserr_resource && - err != nfserr_nofilehandle; -} - /* * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This * is where sequence id's are incremented, and the replay cache is filled. @@ -1643,7 +1631,7 @@ static bool seqid_mutating_err(__be32 err) */ #define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(nfserr) && stateowner) { \ + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { \ stateowner->so_seqid++; \ stateowner->so_replay.rp_status = nfserr; \ stateowner->so_replay.rp_buflen = \ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 76f99e8714f3..b875b0324fc0 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -373,6 +373,22 @@ enum nfsstat4 { NFS4ERR_DELEG_REVOKED = 10087, /* deleg./layout revoked */ }; +static inline bool seqid_mutating_err(u32 err) +{ + /* rfc 3530 section 8.1.5: */ + switch (err) { + case NFS4ERR_STALE_CLIENTID: + case NFS4ERR_STALE_STATEID: + case NFS4ERR_BAD_STATEID: + case NFS4ERR_BAD_SEQID: + case NFS4ERR_BADXDR: + case NFS4ERR_RESOURCE: + case NFS4ERR_NOFILEHANDLE: + return false; + }; + return true; +} + /* * Note: NF4BAD is not actually part of the protocol; it is just used * internally by nfsd. -- cgit v1.2.3-58-ga151 From 9afb978400e65ea6a3200b3e163606ce86e13c25 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 11:39:07 -0400 Subject: nfsd4: simplify lock openmode check Note that the special handling for the lock stateid case is already done by nfs4_check_openmode() (as of 02921914170e3b7fea1cd82dac9713685d2de5e2 "nfsd4: fix openmode checking on IO using lock stateid") so we no longer need these two cases in the caller. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 14c8dd64e136..aafc41a0a8d0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3453,16 +3453,11 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (!nfsd4_has_session(cstate) && !same_clid(&clp->cl_clientid, lockclid)) return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; - } + } + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; } if (nfs4_check_fh(current_fh, stp)) { -- cgit v1.2.3-58-ga151 From b34f27aa5da75b0b6c054e76bb4b92aea7aac04b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 13:13:31 -0400 Subject: nfsd4: get lock checks out of preprocess_seqid_op We've got some lock-specific code here in nfs4_preprocess_seqid_op which is only used by nfsd4_lock(). Move it to the caller. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aafc41a0a8d0..d2bf80d8d85b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3396,7 +3396,7 @@ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, - struct nfs4_stateid **stpp, struct nfsd4_lock *lock) + struct nfs4_stateid **stpp) { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; @@ -3439,27 +3439,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, *stpp = stp; *sopp = sop = stp->st_stateowner; - if (lock) { - clientid_t *lockclid = &lock->v.new.clientid; - struct nfs4_client *clp = sop->so_client; - int lkflg = 0; - __be32 status; - - lkflg = setlkflg(lock->lk_type); - - if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!nfsd4_has_session(cstate) && - !same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; - } - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } - if (nfs4_check_fh(current_fh, stp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; @@ -3522,7 +3501,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp, NULL))) + &oc->oc_stateowner, &stp))) goto out; sop = oc->oc_stateowner; @@ -3585,7 +3564,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, od->od_seqid, &od->od_stateid, OPEN_STATE, - &od->od_stateowner, &stp, NULL))) + &od->od_stateowner, &stp))) goto out; status = nfserr_inval; @@ -3635,7 +3614,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, close->cl_seqid, &close->cl_stateid, OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp, NULL))) + &close->cl_stateowner, &stp))) goto out; status = nfs_ok; update_stateid(&stp->st_stateid); @@ -3997,6 +3976,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock conflock; __be32 status = 0; 
unsigned int strhashval; + int lkflg; int err; dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", @@ -4032,11 +4012,17 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, OPEN_STATE, - &lock->lk_replay_owner, &open_stp, - lock); + &lock->lk_replay_owner, &open_stp); if (status) goto out; + status = nfserr_bad_stateid; open_sop = lock->lk_replay_owner; + if (!open_sop->so_is_open_owner) + goto out; + if (!nfsd4_has_session(cstate) && + !same_clid(&open_sop->so_client->cl_clientid, + &lock->v.new.clientid)) + goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; strhashval = lock_ownerstr_hashval(fp->fi_inode, @@ -4059,7 +4045,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, LOCK_STATE, - &lock->lk_replay_owner, &lock_stp, lock); + &lock->lk_replay_owner, &lock_stp); if (status) goto out; lock_sop = lock->lk_replay_owner; @@ -4067,6 +4053,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } /* lock->lk_replay_owner and lock_stp have been created or found */ + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; + status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) goto out; @@ -4259,7 +4250,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_seqid, &locku->lu_stateid, LOCK_STATE, - &locku->lu_stateowner, &stp, NULL))) + &locku->lu_stateowner, &stp))) goto out; filp = find_any_file(stp->st_file); -- cgit v1.2.3-58-ga151 From 3cc9fda40a427aeb176bab898edca4e9a3ada524 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 Aug 2011 15:17:50 -0400 Subject: nfsd4: remove redundant is_open_owner check When called with OPEN_STATE, preprocess_seqid_op only returns an open stateid, hence only an open owner. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d2bf80d8d85b..d2b637b717c3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4017,8 +4017,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfserr_bad_stateid; open_sop = lock->lk_replay_owner; - if (!open_sop->so_is_open_owner) - goto out; if (!nfsd4_has_session(cstate) && !same_clid(&open_sop->so_client->cl_clientid, &lock->v.new.clientid)) -- cgit v1.2.3-58-ga151 From f32948ddd1179ac0b105ceacc235cfc3f98ebea3 Mon Sep 17 00:00:00 2001 From: Li Haifeng Date: Tue, 30 Aug 2011 17:32:50 +0200 Subject: ext2: fix the outdated comment in ext2_nfs_get_inode() Signed-off-by: Li Haifeng Signed-off-by: Jan Kara --- fs/ext2/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1dd62ed35b85..bd8ac164a3bf 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb, if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) return ERR_PTR(-ESTALE); - /* iget isn't really right if the inode is currently unallocated!! - * ext2_read_inode currently does appropriate checks, but - * it might be "neater" to call ext2_get_inode first and check - * if the inode is valid..... + /* + * ext2_iget isn't quite right if the inode is currently unallocated! 
+ * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. */ inode = ext2_iget(sb, ino); if (IS_ERR(inode)) -- cgit v1.2.3-58-ga151 From c152292f9ee7eb4ed30edc0bd5027a5beef5f5e8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 26 Aug 2011 17:22:06 -0400 Subject: nfsd: remove include/linux/nfsd/syscall.h We don't need this any more. Signed-off-by: J. Bruce Fields --- fs/compat.c | 1 - fs/nfsd/export.c | 1 - fs/nfsd/nfsctl.c | 1 - include/linux/nfsd/Kbuild | 1 - include/linux/nfsd/syscall.h | 116 ------------------------------------------- 5 files changed, 120 deletions(-) delete mode 100644 include/linux/nfsd/syscall.h (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 0b48d018e38a..f2b36d472c73 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f4cc1e2bfc54..d491421cd708 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -16,7 +16,6 @@ #include #include -#include #include #include "nfsd.h" diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c7716143cbd1..db34a585e112 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild index 0e528606d46f..b8d4001212b3 100644 --- a/include/linux/nfsd/Kbuild +++ b/include/linux/nfsd/Kbuild @@ -2,4 +2,3 @@ header-y += debug.h header-y += export.h header-y += nfsfh.h header-y += stats.h -header-y += syscall.h diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h deleted file mode 100644 index 812bc1e160dc..000000000000 --- a/include/linux/nfsd/syscall.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * include/linux/nfsd/syscall.h - * - * This file holds all declarations for the knfsd syscall interface. - * - * Copyright (C) 1995-1997 Olaf Kirch - */ - -#ifndef NFSD_SYSCALL_H -#define NFSD_SYSCALL_H - -#include -#include - -/* - * Version of the syscall interface - */ -#define NFSCTL_VERSION 0x0201 - -/* - * These are the commands understood by nfsctl(). - */ -#define NFSCTL_SVC 0 /* This is a server process. */ -#define NFSCTL_ADDCLIENT 1 /* Add an NFS client. */ -#define NFSCTL_DELCLIENT 2 /* Remove an NFS client. */ -#define NFSCTL_EXPORT 3 /* export a file system. */ -#define NFSCTL_UNEXPORT 4 /* unexport a file system. */ -/*#define NFSCTL_UGIDUPDATE 5 / * update a client's uid/gid map. 
DISCARDED */ -/*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ -#define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ -#define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ - -/* SVC */ -struct nfsctl_svc { - unsigned short svc_port; - int svc_nthreads; -}; - -/* ADDCLIENT/DELCLIENT */ -struct nfsctl_client { - char cl_ident[NFSCLNT_IDMAX+1]; - int cl_naddr; - struct in_addr cl_addrlist[NFSCLNT_ADDRMAX]; - int cl_fhkeytype; - int cl_fhkeylen; - unsigned char cl_fhkey[NFSCLNT_KEYMAX]; -}; - -/* EXPORT/UNEXPORT */ -struct nfsctl_export { - char ex_client[NFSCLNT_IDMAX+1]; - char ex_path[NFS_MAXPATHLEN+1]; - __kernel_old_dev_t ex_dev; - __kernel_ino_t ex_ino; - int ex_flags; - __kernel_uid_t ex_anon_uid; - __kernel_gid_t ex_anon_gid; -}; - -/* GETFD */ -struct nfsctl_fdparm { - struct sockaddr gd_addr; - char gd_path[NFS_MAXPATHLEN+1]; - int gd_version; -}; - -/* GETFS - GET Filehandle with Size */ -struct nfsctl_fsparm { - struct sockaddr gd_addr; - char gd_path[NFS_MAXPATHLEN+1]; - int gd_maxlen; -}; - -/* - * This is the argument union. - */ -struct nfsctl_arg { - int ca_version; /* safeguard */ - union { - struct nfsctl_svc u_svc; - struct nfsctl_client u_client; - struct nfsctl_export u_export; - struct nfsctl_fdparm u_getfd; - struct nfsctl_fsparm u_getfs; - /* - * The following dummy member is needed to preserve binary compatibility - * on platforms where alignof(void*)>alignof(int). It's needed because - * this union used to contain a member (u_umap) which contained a - * pointer. - */ - void *u_ptr; - } u; -#define ca_svc u.u_svc -#define ca_client u.u_client -#define ca_export u.u_export -#define ca_getfd u.u_getfd -#define ca_getfs u.u_getfs -}; - -union nfsctl_res { - __u8 cr_getfh[NFS_FHSIZE]; - struct knfsd_fh cr_getfs; -}; - -#ifdef __KERNEL__ -/* - * Kernel syscall implementation. - */ -extern int exp_addclient(struct nfsctl_client *ncp); -extern int exp_delclient(struct nfsctl_client *ncp); -extern int exp_export(struct nfsctl_export *nxp); -extern int exp_unexport(struct nfsctl_export *nxp); - -#endif /* __KERNEL__ */ - -#endif /* NFSD_SYSCALL_H */ -- cgit v1.2.3-58-ga151 From 1cd9f0976aa4606db8d6e3dc3edd0aca8019372a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 11:54:51 -0400 Subject: ext2,ext3,ext4: don't inherit APPEND_FL or IMMUTABLE_FL for new inodes This doesn't make much sense, and it exposes a bug in the kernel where attempts to create a new file in an append-only directory using O_CREAT will fail (but still leave a zero-length file). This was discovered when xfstests #79 was generalized so it could run on all file systems. Signed-off-by: "Theodore Ts'o" Cc:stable@kernel.org --- fs/ext4/ext4.h | 3 +-- include/linux/ext2_fs.h | 4 ++-- include/linux/ext3_fs.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d7bd0f066e..5c38120c389c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -358,8 +358,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. 
*/ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index 53792bf36c71..ce1b719e8bd4 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -197,8 +197,8 @@ struct ext2_group_desc /* Flags that should be inherited by new inodes from their parent. */ #define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ - EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\ - EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\ + EXT2_SYNC_FL | EXT2_NODUMP_FL |\ + EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\ EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 67a803aee619..0244611eb2b8 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -180,8 +180,8 @@ struct ext3_group_desc /* Flags that should be inherited by new inodes from their parent. */ #define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ - EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\ - EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) -- cgit v1.2.3-58-ga151 From 84ebd795613488992b273220c2937d575d27d2a9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 11:56:51 -0400 Subject: ext4: fake direct I/O mode for data=journal Currently attempts to open a file with O_DIRECT in data=journal mode causes the open to fail with -EINVAL. This makes it very hard to test data=journal mode. So we will let the open succeed, but then always fall back to O_DSYNC buffered writes. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18d2558b7624..b84f127c085d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2854,6 +2854,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; + /* + * If we are doing data journalling we don't support O_DIRECT + */ + if (ext4_should_journal_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -2923,6 +2929,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; -- cgit v1.2.3-58-ga151 From bcaa992975041e40449be8c010c26192b8c8b409 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Aug 2011 11:58:51 -0400 Subject: ext4: ext4_rename should dirty dir_bh with the correct directory When ext4_rename performs a directory rename (move), dir_bh is a buffer that is modified to update the '..' link in the directory being moved (old_inode). However, ext4_handle_dirty_metadata is called with the old parent directory inode (old_dir) and dir_bh, which is incorrect because dir_bh does not belong to the parent inode. 
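Illustration (not part of the patch text; the identifiers are the ext4_rename locals described above): the rule behind this fix, and behind the two ext4 fixes that follow, is that ext4_handle_dirty_metadata() must be passed the inode that the dirtied buffer actually belongs to.

	/*
	 * dir_bh caches a block of old_inode, the directory being moved
	 * (its ".." entry is what gets rewritten), so the buffer must be
	 * journalled against old_inode rather than the old parent old_dir:
	 */
	retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
	if (retval)
		ext4_std_error(old_dir->i_sb, retval);
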
Fix this error. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f8068c7bae9f..09f930b7a785 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2529,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; -- cgit v1.2.3-58-ga151 From f9287c1f2d329f4d78a3bbc9cf0db0ebae6f146a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Aug 2011 12:00:51 -0400 Subject: ext4: ext4_mkdir should dirty dir_block with newly created directory inode ext4_mkdir calls ext4_handle_dirty_metadata with dir_block and the inode "dir". Unfortunately, dir_block belongs to the newly created directory (which is "inode"), not the parent directory (which is "dir"). Fix the incorrect association. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 09f930b7a785..f0abe4323136 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1862,7 +1862,7 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); -- cgit v1.2.3-58-ga151 From 5930ea643805feb50a2f8383ae12eb6f10935e49 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 12:02:51 -0400 Subject: ext4: call ext4_handle_dirty_metadata with correct inode in ext4_dx_add_entry ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads that point to directory blocks assigned to the directory inode. However, the function calls ext4_handle_dirty_metadata with the inode of the file that's being added to the directory, not the directory inode itself. Therefore, correct the code to dirty the directory buffers with the directory inode, not the file inode. Signed-off-by: Darrick J. 
Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f0abe4323136..a067835bbac1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1585,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1611,7 +1611,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; -- cgit v1.2.3-58-ga151 From b7d7ca35807b4c8ca3271885b47e67c843376f77 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 31 Aug 2011 15:39:30 -0400 Subject: nfsd4: fix off-by-one-error in SEQUENCE reply The values here represent highest slotid numbers. Since slotid's are numbered starting from zero, the highest should be one less than the number of slots. Reported-by: Rick Macklem Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 04ad9a2ca3d0..f982d8574d95 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3221,9 +3221,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); WRITE32(seq->seqid); WRITE32(seq->slotid); - WRITE32(seq->maxslots); - /* For now: target_maxslots = maxslots */ - WRITE32(seq->maxslots); + /* Note slotid's are numbered from zero: */ + WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ + WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ WRITE32(seq->status_flags); ADJUST_ARGS(); -- cgit v1.2.3-58-ga151 From c2d8eb7ac645e1baba7205cb2631e2f21db3d6a9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 29 Aug 2011 10:36:11 -0400 Subject: nfsd4: remove typoed replay field Wow, I wonder how long that typo's been there. Signed-off-by: J. Bruce Fields --- fs/nfsd/state.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index f02badd70cf2..6b706a60ce88 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -312,7 +312,6 @@ struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; char *rp_buf; - unsigned intrp_allocated; struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -- cgit v1.2.3-58-ga151 From 5fa0bbb4ee5481a6b3e83c4968142ca433d71914 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 31 Aug 2011 15:25:46 -0400 Subject: nfsd4: simplify distinguishing lock & open stateid's The trick free_stateid is using is a little cheesy, and we'll have more uses for this field later. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 9 +++------ fs/nfsd/state.h | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d2b637b717c3..7de214b860db 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2285,6 +2285,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &sop->so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); + stp->st_type = NFS4_OPEN_STID; stp->st_stateowner = sop; get_nfs4_file(fp); stp->st_file = fp; @@ -3201,11 +3202,6 @@ static int is_delegation_stateid(stateid_t *stateid) return stateid->si_fileid == 0; } -static int is_open_stateid(struct nfs4_stateid *stateid) -{ - return stateid->st_openstp == NULL; -} - __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { struct nfs4_stateid *stp = NULL; @@ -3369,7 +3365,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } } - if (is_open_stateid(stp)) { + if (stp->st_type == NFS4_OPEN_STID) { ret = nfserr_locks_held; goto out; } else { @@ -3928,6 +3924,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &sop->so_stateids); stp->st_stateowner = sop; + stp->st_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 6b706a60ce88..a06f55bd38b6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -425,6 +425,9 @@ static inline struct file *find_any_file(struct nfs4_file *f) */ struct nfs4_stateid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 + char st_type; struct list_head st_hash; struct list_head st_perfile; struct list_head st_perstateowner; -- cgit v1.2.3-58-ga151 From b79abaddfe7ef29e00d71721cf03a33e00d53317 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 18:01:43 -0400 Subject: nfsd4: consolidate lock & open stateid tables There's no reason to have two separate hash tables for open and lock stateid's. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 63 +++++++++++++---------------------------------------- 1 file changed, 15 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7de214b860db..0198328c0e84 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static u64 current_sessionid = 1; /* forward declarations */ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); -static struct nfs4_stateid * search_for_stateid(stateid_t *stid); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); @@ -3211,7 +3210,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_expired; - stp = search_for_stateid(stateid); + stp = find_stateid(stateid, 0); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3349,7 +3348,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - stp = search_for_stateid(stateid); + stp = find_stateid(stateid, 0); if (!stp) { ret = nfserr_bad_stateid; goto out; @@ -3718,7 +3717,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; static int same_stateid(stateid_t *id_one, stateid_t *id_two) @@ -3728,50 +3726,21 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_stateid * -find_stateid(stateid_t *stid, int flags) +static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) { - struct nfs4_stateid *local; - u32 st_id = stid->si_stateownerid; - u32 f_id = stid->si_fileid; + struct nfs4_stateid *s; unsigned int hashval; - dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - - if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; + hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); + list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { + if (!same_stateid(&s->st_stateid, t)) + continue; + if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) + return NULL; + if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) + return NULL; + return s; } - } - return NULL; -} - -static struct nfs4_stateid * -search_for_stateid(stateid_t *stid) -{ - struct nfs4_stateid *local; - unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid); - - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } - - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } return NULL; } @@ -3920,7 +3889,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc 
INIT_LIST_HEAD(&stp->st_perfile); INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ - list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &sop->so_stateids); stp->st_stateowner = sop; @@ -4497,10 +4466,8 @@ nfs4_state_init(void) INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); INIT_LIST_HEAD(&open_ownerid_hashtbl[i]); } - for (i = 0; i < STATEID_HASH_SIZE; i++) { + for (i = 0; i < STATEID_HASH_SIZE; i++) INIT_LIST_HEAD(&stateid_hashtbl[i]); - INIT_LIST_HEAD(&lockstateid_hashtbl[i]); - } for (i = 0; i < LOCK_HASH_SIZE; i++) { INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); -- cgit v1.2.3-58-ga151 From 81b829655d418316f0707b3656b45cff7a1dbf12 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 Aug 2011 11:03:29 -0400 Subject: nfsd4: simplify stateid generation code, fix wraparound Follow the recommendation from rfc3530bis for stateid generation number wraparound, simplify some code, and fix or remove incorrect comments. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 52 +++++++++++++++++++++++----------------------------- fs/nfsd/state.h | 3 +++ 2 files changed, 26 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0198328c0e84..106e8fa63cdf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3168,6 +3168,12 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } +/* Returns true iff a is later than b: */ +static bool stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)a->si_generation - (s32)b->si_generation > 0; +} + static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -3175,25 +3181,25 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess * when it is zero. */ if (has_session && in->si_generation == 0) - goto out; + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (in->si_generation > ref->si_generation) + if (stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* - * The following, however, can happen. For example, if the - * client sends an open and some IO at the same time, the open - * may bump si_generation while the IO is still in flight. - * Thanks to hard links and renames, the client never knows what - * file an open will affect. So it could avoid that situation - * only by serializing all opens and IO from the same open - * owner. To recover from the old_stateid error, the client - * will just have to retry the IO: + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. 
The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: */ - if (in->si_generation < ref->si_generation) - return nfserr_old_stateid; -out: - return nfs_ok; + return nfserr_old_stateid; } static int is_delegation_stateid(stateid_t *stateid) @@ -3353,16 +3359,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_bad_stateid; goto out; } - if (stateid->si_generation != 0) { - if (stateid->si_generation < stp->st_stateid.si_generation) { - ret = nfserr_old_stateid; - goto out; - } - if (stateid->si_generation > stp->st_stateid.si_generation) { - ret = nfserr_bad_stateid; - goto out; - } - } + ret = check_stateid_generation(stateid, &stp->st_stateid, 1); + if (ret) + goto out; if (stp->st_type == NFS4_OPEN_STID) { ret = nfserr_locks_held; @@ -3439,11 +3438,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfserr_bad_stateid; } - /* - * We now validate the seqid and stateid generation numbers. - * For the moment, we ignore the possibility of - * generation number wraparound. - */ if (!nfsd4_has_session(cstate) && seqid != sop->so_seqid) goto check_replay; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a06f55bd38b6..c425717715f6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -293,6 +293,9 @@ static inline void update_stateid(stateid_t *stateid) { stateid->si_generation++; + /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ + if (stateid->si_generation == 0) + stateid->si_generation = 1; } /* A reasonable value for REPLAY_ISIZE was estimated as follows: -- cgit v1.2.3-58-ga151 From 73997dc4183c580278ea8cb41c7a9655940801e0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 31 Aug 2011 15:47:21 -0400 Subject: nfsd4: make delegation stateid's seqid start at 1 Thanks to Casey for reminding me that 5661 gives a special meaning to a value of 0 in the stateid's seqid field, so all stateid's should start out with si_generation 1. We were doing that in the open and lock cases for minorversion 1, but not for the delegation stateid, and not for openstateid's with v4.0. It doesn't *really* matter much for v4.0 or for delegation stateid's (which never get the seqid field incremented), but we may as well do the same for all of them. Reported-by: Casey Bodley Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 106e8fa63cdf..80af79ee5d51 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -250,7 +250,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_stateid.si_boot = boot_time; dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 0; + dp->dl_stateid.si_generation = 1; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -2291,6 +2291,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; + /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; @@ -2894,7 +2895,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; - update_stateid(&stp->st_stateid); } else { status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) @@ -2905,9 +2905,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } - if (nfsd4_has_session(&resp->cstate)) - update_stateid(&stp->st_stateid); } + update_stateid(&stp->st_stateid); memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) @@ -3893,6 +3892,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; + /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; -- cgit v1.2.3-58-ga151 From f3e4223751392b9bc0195a806a6e99b4cc399ac0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 24 Aug 2011 12:27:31 -0400 Subject: nfsd4: centralize handling of replay owners Set the stateowner associated with a replay in one spot in nfs4_preprocess_seqid_op() and keep it in cstate. This allows removing a few lines of boilerplate from all the nfs4_preprocess_seqid_op() callers. Also turn ENCODE_SEQID_OP_TAIL into a function while we're here. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 25 ++++--------------------- fs/nfsd/nfs4xdr.c | 34 +++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 80af79ee5d51..e4535ff92876 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3425,12 +3425,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, /* It's not stale; let's assume it's expired: */ if (sop == NULL) return nfserr_expired; - *sopp = sop; + nfs4_get_stateowner(sop); + cstate->replay_owner = sop; goto check_replay; } *stpp = stp; *sopp = sop = stp->st_stateowner; + nfs4_get_stateowner(sop); + cstate->replay_owner = sop; if (nfs4_check_fh(current_fh, stp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); @@ -3501,10 +3504,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_create_clid_dir(sop->so_client); out: - if (oc->oc_stateowner) { - nfs4_get_stateowner(oc->oc_stateowner); - cstate->replay_owner = oc->oc_stateowner; - } nfs4_unlock_state(); return status; } @@ -3574,10 +3573,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); status = nfs_ok; out: - if (od->od_stateowner) { - nfs4_get_stateowner(od->od_stateowner); - cstate->replay_owner = od->od_stateowner; - } nfs4_unlock_state(); return status; } @@ -3618,10 +3613,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (list_empty(&close->cl_stateowner->so_stateids)) move_to_close_lru(close->cl_stateowner); out: - if (close->cl_stateowner) { - nfs4_get_stateowner(close->cl_stateowner); - cstate->replay_owner = close->cl_stateowner; - } nfs4_unlock_state(); return status; } @@ -4086,10 +4077,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - if (lock->lk_replay_owner) { - nfs4_get_stateowner(lock->lk_replay_owner); - cstate->replay_owner = lock->lk_replay_owner; - } nfs4_unlock_state(); return status; } @@ -4244,10 +4231,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); out: - if (locku->lu_stateowner) { - nfs4_get_stateowner(locku->lu_stateowner); - cstate->replay_owner = locku->lu_stateowner; - } nfs4_unlock_state(); return status; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f982d8574d95..ee1267838719 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1630,15 +1630,19 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) * we know whether the error to be returned is a sequence id mutating error. 
*/ -#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { \ - stateowner->so_seqid++; \ - stateowner->so_replay.rp_status = nfserr; \ - stateowner->so_replay.rp_buflen = \ - (((char *)(resp)->p - (char *)save)); \ - memcpy(stateowner->so_replay.rp_buf, save, \ - stateowner->so_replay.rp_buflen); \ - } } while (0); +static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) +{ + struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; + + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { + stateowner->so_seqid++; + stateowner->so_replay.rp_status = nfserr; + stateowner->so_replay.rp_buflen = + (char *)resp->p - (char *)save; + memcpy(stateowner->so_replay.rp_buf, save, + stateowner->so_replay.rp_buflen); + } +} /* Encode as an array of strings the string given with components * separated @sep. @@ -2495,7 +2499,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2599,7 +2603,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2619,7 +2623,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2700,7 +2704,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - ENCODE_SEQID_OP_TAIL(open->op_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2712,7 +2716,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2724,7 +2728,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - ENCODE_SEQID_OP_TAIL(od->od_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } -- cgit v1.2.3-58-ga151 From 9072d5c66b17292e3cd055bc8057b2ce6af2fe34 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 24 Aug 2011 12:45:03 -0400 Subject: nfsd4: cleanup seqid op stateowner usage Now that the replay owner is in the cstate we can remove it from a lot of other individual operations and further simplify nfs4_preprocess_seqid_op(). Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 57 ++++++++++++++++++++++------------------------------- fs/nfsd/nfs4xdr.c | 5 ----- fs/nfsd/xdr4.h | 7 ------- 3 files changed, 24 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e4535ff92876..bc1a9dbc289c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3388,7 +3388,6 @@ setlkflg (int type) static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, - struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp) { struct nfs4_stateid *stp; @@ -3400,7 +3399,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, seqid, STATEID_VAL(stateid)); *stpp = NULL; - *sopp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); @@ -3431,7 +3429,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, } *stpp = stp; - *sopp = sop = stp->st_stateowner; + sop = stp->st_stateowner; nfs4_get_stateowner(sop); cstate->replay_owner = sop; @@ -3467,7 +3465,6 @@ check_replay: } dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", sop->so_seqid, seqid); - *sopp = NULL; return nfserr_bad_seqid; } @@ -3489,13 +3486,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp))) + CONFIRM | OPEN_STATE, &stp); + if (status) goto out; - sop = oc->oc_stateowner; + sop = stp->st_stateowner; sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3547,11 +3544,9 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, return nfserr_inval; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - od->od_seqid, - &od->od_stateid, - OPEN_STATE, - &od->od_stateowner, &stp))) + status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, + &od->od_stateid, OPEN_STATE, &stp); + if (status) goto out; status = nfserr_inval; @@ -3586,6 +3581,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfs4_stateid *stp; + struct nfs4_stateowner *so; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3593,12 +3589,12 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(cstate, - close->cl_seqid, + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp))) + OPEN_STATE | CLOSE_STATE, &stp); + if (status) goto out; + so = stp->st_stateowner; status = nfs_ok; update_stateid(&stp->st_stateid); memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3610,8 +3606,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&close->cl_stateowner->so_stateids)) - move_to_close_lru(close->cl_stateowner); + if (list_empty(&so->so_stateids)) + move_to_close_lru(so); out: nfs4_unlock_state(); return status; @@ -3962,12 +3958,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, 
lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, - &lock->lk_replay_owner, &open_stp); + OPEN_STATE, &open_stp); if (status) goto out; status = nfserr_bad_stateid; - open_sop = lock->lk_replay_owner; + open_sop = open_stp->st_stateowner; if (!nfsd4_has_session(cstate) && !same_clid(&open_sop->so_client->cl_clientid, &lock->v.new.clientid)) @@ -3993,14 +3988,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, - LOCK_STATE, - &lock->lk_replay_owner, &lock_stp); + LOCK_STATE, &lock_stp); if (status) goto out; - lock_sop = lock->lk_replay_owner; + lock_sop = lock_stp->st_stateowner; fp = lock_stp->st_file; } - /* lock->lk_replay_owner and lock_stp have been created or found */ + /* lock_sop and lock_stp have been created or found */ lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); @@ -4191,13 +4185,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - locku->lu_seqid, - &locku->lu_stateid, - LOCK_STATE, - &locku->lu_stateowner, &stp))) + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, LOCK_STATE, &stp); + if (status) goto out; - filp = find_any_file(stp->st_file); if (!filp) { status = nfserr_lock_range; @@ -4206,7 +4197,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_owner = (fl_owner_t) stp->st_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ee1267838719..462c6eff8471 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { DECODE_HEAD; - close->cl_stateowner = NULL; READ_BUF(4); READ32(close->cl_seqid); return nfsd4_decode_stateid(argp, &close->cl_stateid); @@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { DECODE_HEAD; - lock->lk_replay_owner = NULL; /* * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ @@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { DECODE_HEAD; - locku->lu_stateowner = NULL; READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) @@ -739,7 +736,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con { DECODE_HEAD; - open_conf->oc_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); if (status) return status; @@ -754,7 +750,6 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d { DECODE_HEAD; - open_down->od_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 663193b21a24..341f0a17d217 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -81,7 +81,6 @@ struct nfsd4_access { struct nfsd4_close { u32 cl_seqid; /* request */ stateid_t cl_stateid; /* request+response */ - struct nfs4_stateowner * cl_stateowner; /* response */ }; struct nfsd4_commit { @@ -165,9 +164,6 @@ struct nfsd4_lock { } ok; struct nfsd4_lock_denied denied; } u; - /* The 
lk_replay_owner is the open owner in the open_to_lock_owner - * case and the lock owner otherwise: */ - struct nfs4_stateowner *lk_replay_owner; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -199,7 +195,6 @@ struct nfsd4_locku { stateid_t lu_stateid; u64 lu_offset; u64 lu_length; - struct nfs4_stateowner *lu_stateowner; }; @@ -243,7 +238,6 @@ struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; u32 oc_seqid /* request */; stateid_t oc_resp_stateid /* response */; - struct nfs4_stateowner * oc_stateowner; /* response */ }; struct nfsd4_open_downgrade { @@ -251,7 +245,6 @@ struct nfsd4_open_downgrade { u32 od_seqid; u32 od_share_access; u32 od_share_deny; - struct nfs4_stateowner *od_stateowner; }; -- cgit v1.2.3-58-ga151 From 5ec094c1096ab3bb795651855d53f18daa26afde Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Aug 2011 17:02:48 -0400 Subject: nfsd4: extend state lock over seqid replay logic There are currently a couple races in the seqid replay code: a retransmission could come while we're still encoding the original reply, or a new seqid-mutating call could come as we're encoding a replay. So, extend the state lock over the encoding (both encoding of a replayed reply and caching of the original encoded reply). I really hate doing this, and previously added the stateowner reference-counting code to avoid it (which was insufficient)--but I don't see a less complicated alternative at the moment. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 5 +++-- fs/nfsd/nfs4state.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 50bae7471147..50063a85f505 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -408,8 +408,8 @@ out: if (open->op_stateowner) { nfs4_get_stateowner(open->op_stateowner); cstate->replay_owner = open->op_stateowner; - } - nfs4_unlock_state(); + } else + nfs4_unlock_state(); return status; } @@ -1227,6 +1227,7 @@ encode_op: be32_to_cpu(status)); if (cstate->replay_owner) { + nfs4_unlock_state(); nfs4_put_stateowner(cstate->replay_owner); cstate->replay_owner = NULL; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bc1a9dbc289c..6cf729a096c3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3501,7 +3501,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_create_clid_dir(sop->so_client); out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3568,7 +3569,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); status = nfs_ok; out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3609,7 +3611,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (list_empty(&so->so_stateids)) move_to_close_lru(so); out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -4071,7 +4074,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } -- cgit v1.2.3-58-ga151 From fff6ca9cc46857e5814cf687e5fb1b8a876766a4 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Thu, 25 Aug 2011 18:17:52 -0400 Subject: nfsd4: eliminate impossible open replay case If open fails with any error other than nfserr_replay_me, then the main nfsd4_proc_compound() loop continues unconditionally to nfsd4_encode_operation(), which will always call encode_seqid_op_tail. Thus the condition we check for here does not occur. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6cf729a096c3..26b0c75aa93b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2495,18 +2495,8 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_stateowner = NULL; goto renew; } - if (open->op_seqid == sop->so_seqid - 1) { - if (sop->so_replay.rp_buflen) - return nfserr_replay_me; - /* The original OPEN failed so spectacularly - * that we don't even have replay data saved! - * Therefore, we have no choice but to continue - * processing this OPEN; presumably, we'll - * fail again for the same reason. - */ - dprintk("nfsd4_process_open1: replay with no replay cache\n"); - goto renew; - } + if (open->op_seqid == sop->so_seqid - 1) + return nfserr_replay_me; if (open->op_seqid != sop->so_seqid) return nfserr_bad_seqid; renew: -- cgit v1.2.3-58-ga151 From 7c13f344cf8bec22301c5ed7ef1d90eecb57ba43 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Aug 2011 22:15:47 -0400 Subject: nfsd4: drop most stateowner refcounting Maybe we'll bring it back some day, but we don't have much real use for it now. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 6 ++---- fs/nfsd/nfs4state.c | 23 ++++++++++++----------- fs/nfsd/nfs4xdr.c | 11 ++++++----- fs/nfsd/state.h | 15 +-------------- fs/nfsd/xdr4.h | 2 +- 5 files changed, 22 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 50063a85f505..ce151f0ed4b9 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -405,10 +405,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: - if (open->op_stateowner) { - nfs4_get_stateowner(open->op_stateowner); + if (open->op_stateowner) cstate->replay_owner = open->op_stateowner; - } else + else nfs4_unlock_state(); return status; } @@ -1228,7 +1227,6 @@ encode_op: if (cstate->replay_owner) { nfs4_unlock_state(); - nfs4_put_stateowner(cstate->replay_owner); cstate->replay_owner = NULL; } /* XXX Ugh, we need to get rid of this kind of special case: */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 26b0c75aa93b..834a5f844f42 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -453,7 +453,7 @@ static void unhash_lockowner(struct nfs4_stateowner *sop) static void release_lockowner(struct nfs4_stateowner *sop) { unhash_lockowner(sop); - nfs4_put_stateowner(sop); + nfs4_free_stateowner(sop); } static void @@ -496,7 +496,7 @@ static void release_openowner(struct nfs4_stateowner *sop) { unhash_openowner(sop); list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + nfs4_free_stateowner(sop); } #define SESSION_HASH_SIZE 512 @@ -2206,10 +2206,8 @@ out_nomem: } void -nfs4_free_stateowner(struct kref *kref) +nfs4_free_stateowner(struct nfs4_stateowner *sop) { - struct nfs4_stateowner *sop = - container_of(kref, struct nfs4_stateowner, so_ref); kfree(sop->so_owner.data); kmem_cache_free(stateowner_slab, sop); } @@ -2236,7 +2234,6 @@ static inline struct 
nfs4_stateowner *alloc_stateowner(struct xdr_netobj *owner, } sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); INIT_LIST_HEAD(&sop->so_perstateid); @@ -3413,14 +3410,12 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, /* It's not stale; let's assume it's expired: */ if (sop == NULL) return nfserr_expired; - nfs4_get_stateowner(sop); cstate->replay_owner = sop; goto check_replay; } *stpp = stp; sop = stp->st_stateowner; - nfs4_get_stateowner(sop); cstate->replay_owner = sop; if (nfs4_check_fh(current_fh, stp)) { @@ -3783,11 +3778,17 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) if (fl->fl_lmops == &nfsd_posix_mng_ops) { sop = (struct nfs4_stateowner *) fl->fl_owner; - kref_get(&sop->so_ref); - deny->ld_sop = sop; + deny->ld_owner.data = kmemdup(sop->so_owner.data, + sop->so_owner.len, GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_owner.len = sop->so_owner.len; deny->ld_clientid = sop->so_client->cl_clientid; } else { - deny->ld_sop = NULL; +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 462c6eff8471..c4dcba3aac1f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2570,17 +2570,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh static void nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) { + struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); + RESERVE_SPACE(32 + XDR_LEN(conf->len)); WRITE64(ld->ld_start); WRITE64(ld->ld_length); WRITE32(ld->ld_type); - if (ld->ld_sop) { + if (conf->len) { WRITEMEM(&ld->ld_clientid, 8); - WRITE32(ld->ld_sop->so_owner.len); - WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); - kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + WRITE32(conf->len); + WRITEMEM(conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ WRITE64((u64)0); /* clientid */ WRITE32(0); /* length of owner name */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c425717715f6..f7114fc21dee 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -338,7 +338,6 @@ struct nfs4_replay { * reaped by laundramat thread after lease period. 
*/ struct nfs4_stateowner { - struct kref so_ref; struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ struct list_head so_perclient; @@ -459,7 +458,7 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct kref *kref); +extern void nfs4_free_stateowner(struct nfs4_stateowner *sop); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); @@ -482,16 +481,4 @@ extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(stateid_t *, bool); -static inline void -nfs4_put_stateowner(struct nfs4_stateowner *so) -{ - kref_put(&so->so_ref, nfs4_free_stateowner); -} - -static inline void -nfs4_get_stateowner(struct nfs4_stateowner *so) -{ - kref_get(&so->so_ref); -} - #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 341f0a17d217..de236fb89e74 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -130,7 +130,7 @@ struct nfsd4_link { struct nfsd4_lock_denied { clientid_t ld_clientid; - struct nfs4_stateowner *ld_sop; + struct xdr_netobj ld_owner; u64 ld_start; u64 ld_length; u32 ld_type; -- cgit v1.2.3-58-ga151 From 16d259418b7c0dda79b71bfbfeaedc0ba4035f23 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 1 Sep 2011 11:31:45 -0400 Subject: nfsd4: eliminate unused lt_stateowner This is used only as a local variable. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- fs/nfsd/xdr4.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 834a5f844f42..a47bf884726f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4098,6 +4098,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; + struct nfs4_stateowner *so; int error; __be32 status; @@ -4107,7 +4108,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - lockt->lt_stateowner = NULL; nfs4_lock_state(); status = nfserr_stale_clientid; @@ -4134,10 +4134,10 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lockt->lt_stateowner = find_lockstateowner_str(inode, + so = find_lockstateowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); - if (lockt->lt_stateowner) - file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + if (so) + file_lock.fl_owner = (fl_owner_t)so; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index de236fb89e74..27a3dfab96a9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -184,7 +184,6 @@ struct nfsd4_lockt { struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfs4_stateowner * lt_stateowner; struct nfsd4_lock_denied lt_denied; }; -- cgit v1.2.3-58-ga151 From 7a8711c9a6e2299c324c153da6e3381b1fd56128 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 2 Sep 2011 09:03:37 -0400 Subject: nfsd4: share common seqid checks Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a47bf884726f..8edc9ad63ea6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2456,6 +2456,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = { .lm_change = nfsd_change_deleg_cb, }; +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, @@ -2465,6 +2475,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_stateowner *sop = NULL; + __be32 status; if (!check_name(open->op_owner)) return nfserr_inval; @@ -2482,9 +2493,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_expired; goto renew; } - /* When sessions are used, skip open sequenceid processing */ - if (nfsd4_has_session(cstate)) - goto renew; if (!sop->so_confirmed) { /* Replace unconfirmed owners without checking for replay. */ clp = sop->so_client; @@ -2492,10 +2500,9 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_stateowner = NULL; goto renew; } - if (open->op_seqid == sop->so_seqid - 1) - return nfserr_replay_me; - if (open->op_seqid != sop->so_seqid) - return nfserr_bad_seqid; + status = nfsd4_check_seqid(cstate, sop, open->op_seqid); + if (status) + return status; renew: if (open->op_stateowner == NULL) { sop = alloc_init_open_stateowner(strhashval, clp, open); @@ -3411,7 +3418,10 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (sop == NULL) return nfserr_expired; cstate->replay_owner = sop; - goto check_replay; + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + return nfserr_bad_seqid; } *stpp = stp; @@ -3423,8 +3433,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfserr_bad_stateid; } - if (!nfsd4_has_session(cstate) && seqid != sop->so_seqid) - goto check_replay; + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; if (sop->so_confirmed && flags & CONFIRM) { dprintk("NFSD: preprocess_seqid_op: expected" @@ -3441,16 +3452,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return status; renew_client(sop->so_client); return nfs_ok; - -check_replay: - if (seqid == sop->so_seqid - 1) { - dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); - /* indicate replay to calling function */ - return nfserr_replay_me; - } - dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", - sop->so_seqid, seqid); - return nfserr_bad_seqid; } __be32 -- cgit v1.2.3-58-ga151 From 77eaae8d44ec5942033b751d0e0d2914c9411862 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 2 Sep 2011 12:08:20 -0400 Subject: nfsd4: simplify check_open logic Sometimes the single-exit style is good, sometimes it's unnecessarily convoluted.... Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8edc9ad63ea6..8694e60a4520 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2573,7 +2573,6 @@ static __be32 nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) { struct nfs4_stateid *local; - __be32 status = nfserr_share_denied; struct nfs4_stateowner *sop = open->op_stateowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { @@ -2585,11 +2584,9 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_state *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) - goto out; + return nfserr_share_denied; } - status = 0; -out: - return status; + return nfs_ok; } static inline struct nfs4_stateid * -- cgit v1.2.3-58-ga151 From 68b66e8270f44d297a48662e6aed72c5944f77bd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 2 Sep 2011 12:19:43 -0400 Subject: nfsd4: move double-confirm test to open_confirm I don't see the point of having this check in nfs4_preprocess_seqid_op() when it's only needed by the one caller. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8694e60a4520..9c44630a245a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3434,11 +3434,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (status) return status; - if (sop->so_confirmed && flags & CONFIRM) { - dprintk("NFSD: preprocess_seqid_op: expected" - " unconfirmed stateowner!\n"); - return nfserr_bad_stateid; - } if (!sop->so_confirmed && !(flags & CONFIRM)) { dprintk("NFSD: preprocess_seqid_op: stateowner not" " confirmed yet!\n"); @@ -3473,9 +3468,11 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, &stp); if (status) - goto out; - + goto out; sop = stp->st_stateowner; + status = nfserr_bad_stateid; + if (sop->so_confirmed) + goto out; sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3483,6 +3480,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); nfsd4_create_clid_dir(sop->so_client); + status = nfs_ok; out: if (!cstate->replay_owner) nfs4_unlock_state(); -- cgit v1.2.3-58-ga151 From 4e96b2dbbf1d7e81f22047a50f862555a6cb87cb Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:51:09 -0400 Subject: ext4: Add new ext4_discard_partial_page_buffers routines This patch adds two new routines: ext4_discard_partial_page_buffers and ext4_discard_partial_page_buffers_no_lock. The ext4_discard_partial_page_buffers routine is a wrapper function to ext4_discard_partial_page_buffers_no_lock. The wrapper function locks the page and passes it to ext4_discard_partial_page_buffers_no_lock. Calling functions that already have the page locked can call ext4_discard_partial_page_buffers_no_lock directly. The ext4_discard_partial_page_buffers_no_lock function zeros a specified range in a page, and unmaps the corresponding buffer heads. Only block aligned regions of the page will have their buffer heads unmapped. 
Non-block-aligned regions will be mapped if needed so that they can be updated with the partial zero out. This function is meant to be used to update a page and its buffer heads to be zeroed and unmapped when the corresponding blocks have been released or will be released. This routine is used in the following scenarios: * A hole is punched and the non page aligned regions of the head and tail of the hole need to be discarded * The file is truncated and the partial page beyond EOF needs to be discarded * The end of a hole is in the same page as EOF. After the page is flushed, the partial page beyond EOF needs to be discarded. * A write operation begins or ends inside a hole and the partial page appearing before or after the write needs to be discarded * A write operation extends EOF and the partial page beyond EOF needs to be discarded This function takes a flag EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED which is used when a write operation begins or ends in a hole. When the EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED flag is used, only buffer heads that are already unmapped will have the corresponding regions of the page zeroed. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 +++ fs/ext4/inode.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 235 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5c38120c389c..ccfa81f33bb0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -528,6 +528,11 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +/* + * Flags used by ext4_discard_partial_page_buffers + */ +#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 + /* * ioctl commands */ @@ -1838,6 +1843,12 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b84f127c085d..d1b1ef71e5b7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2966,6 +2966,230 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_journalled_aops; } + +/* + * ext4_discard_partial_page_buffers() + * Wrapper function for ext4_discard_partial_page_buffers_no_lock. + * This function finds and locks the page containing the offset + * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. + * Calling functions that already have the page locked should call + * ext4_discard_partial_page_buffers_no_lock directly. 
+ */ +int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags) +{ + struct inode *inode = mapping->host; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -EINVAL; + + err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, + from, length, flags); + + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_discard_partial_page_buffers_no_lock() + * Zeros a page range of length 'length' starting from offset 'from'. + * Buffer heads that correspond to the block aligned regions of the + * zeroed range will be unmapped. Non-block-aligned regions + * will have the corresponding buffer head mapped if needed so that + * that region of the page can be updated with the partial zero out. + * + * This function assumes that the page has already been locked. The + * range to be discarded must be contained within the given page. + * If the specified range exceeds the end of the page it will be shortened + * to the end of the page that corresponds to 'from'. This function is + * appropriate for updating a page and its buffer heads to be unmapped and + * zeroed for blocks that have been either released, or are going to be + * released. + * + * handle: The journal handle + * inode: The files inode + * page: A locked page that contains the offset "from" + * from: The starting byte offset (from the beginning of the file) + * to begin discarding + * len: The length of bytes to discard + * flags: Optional flags that may be used: + * + * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED + * Only zero the regions of the page whose buffer heads + * have already been unmapped. This flag is appropriate + * for updating the contents of a page whose blocks may + * have already been released, and we only want to zero + * out the regions that correspond to those released blocks. + * + * Returns zero on success or negative on failure. + */ +int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned int offset = from & (PAGE_CACHE_SIZE-1); + unsigned int blocksize, max, pos; + unsigned int end_of_block, range_to_discard; + ext4_lblk_t iblock; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + max = PAGE_CACHE_SIZE - offset; + + if (index != page->index) + return -EINVAL; + + /* + * correct length if it does not fall between + * 'from' and the end of the page + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) { + /* + * If the range to be discarded covers a partial block + * we need to get the page buffers. This is because + * partial blocks cannot be released and the page needs + * to be updated with the contents of the block before + * we write the zeros on top of it. 
+ */ + if (!(from & (blocksize - 1)) || + !((from + length) & (blocksize - 1))) { + create_empty_buffers(page, blocksize, 0); + } else { + /* + * If there are no partial blocks, + * there is nothing to update, + * so we can return now + */ + return 0; + } + } + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + pos = offset; + while (pos < offset + length) { + err = 0; + + /* The length of space left to zero and unmap */ + range_to_discard = offset + length - pos; + + /* The length of space until the end of the block */ + end_of_block = blocksize - (pos & (blocksize-1)); + + /* + * Do not unmap or zero past end of block + * for this buffer head + */ + if (range_to_discard > end_of_block) + range_to_discard = end_of_block; + + + /* + * Skip this buffer head if we are only zeroing unmapped + * regions of the page + */ + if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && + buffer_mapped(bh)) + goto next; + + /* If the range is block aligned, unmap */ + if (range_to_discard == blocksize) { + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + clear_buffer_uptodate(bh); + zero_user(page, pos, range_to_discard); + BUFFER_TRACE(bh, "Buffer discarded"); + goto next; + } + + /* + * If this block is not completely contained in the range + * to be discarded, then it is not going to be released. Because + * we need to keep this block, we need to make sure this part + * of the page is uptodate before we modify it by writing + * partial zeros on it. + */ + if (!buffer_mapped(bh)) { + /* + * Buffer head must be mapped before we can read + * from the block + */ + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto next; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt.*/ + if (!buffer_uptodate(bh)) + goto next; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto next; + } + + zero_user(page, pos, range_to_discard); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + if (ext4_should_order_data(inode) && + EXT4_I(inode)->jinode) + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + + BUFFER_TRACE(bh, "Partial buffer zeroed"); +next: + bh = bh->b_this_page; + iblock++; + pos += range_to_discard; + } + + return err; +} + /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. -- cgit v1.2.3-58-ga151 From ba06208a1315ab2d2217e09c79582b886c9f629e Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:55:59 -0400 Subject: ext4: fix xfstests 75, 112, 127 punch hole failure This patch addresses a bug found by xfstests 75, 112, 127 when blocksize = 1k. This bug happens because the punch hole code only zeros out non block aligned regions of the page. 
This means that if the blocks are smaller than a page, then the block aligned regions of the page inside the hole are left un-zeroed, and their buffer heads are still mapped. This bug is corrected by using ext4_discard_partial_page_buffers to properly zero the partial page at the head and tail of the hole, and unmap the corresponding buffer heads. This patch also addresses a bug reported by Lukas while working on a new patch to add discard support for loop devices using punch hole. The bug happened because the first and last block numbers needed to be cast to a larger data type before calculating the byte offset, but since now we only need the byte offsets of the pages, we no longer even need to be calculating the byte offsets of the blocks. The code to do the block offset calculations is removed in this patch. Signed-off-by: Allison Henderson --- fs/ext4/extents.c | 61 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57cf568a98ab..18f7e04a4fa3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4162,17 +4162,14 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) struct address_space *mapping = inode->i_mapping; struct ext4_map_blocks map; handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4211,24 +4208,44 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; /* - * Now we need to zero out the un block aligned data. - * If the file is smaller than a block, just - * zero out the middle + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zeroed. 
*/ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (err) + goto out; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; } } -- cgit v1.2.3-58-ga151 From 2be4751b21ae1cacb002da48cfc5bf6743fee8c1 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:56:52 -0400 Subject: ext4: fix 2nd xfstests 127 punch hole failure This patch fixes a second punch hole bug found by xfstests 127. This bug happens because punch hole needs to flush the pages of the hole to avoid race conditions. But if the end of the hole is in the same page as i_size, the buffer heads beyond i_size need to be unmapped and the page needs to be zeroed after it is flushed. To correct this, the new ext4_discard_partial_page_buffers routine is used to zero and unmap the partial page beyond i_size if the end of the hole appears in the same page as i_size. The code has also been optimized to set the end of the hole to the page after i_size if the specified hole exceeds i_size, and the code that flushes the pages has been simplified. Signed-off-by: Allison Henderson --- fs/ext4/extents.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 18f7e04a4fa3..9124cd24e093 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4166,6 +4166,20 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + return 0; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4182,11 +4196,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) */ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 
0 : first_page_offset-1, - last_page_offset); + offset, offset + length - 1); - if (err) - return err; + if (err) + return err; } /* Now release the pages */ @@ -4249,6 +4262,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out; + } + } + /* If there are no blocks to remove, return now */ if (first_block >= last_block) goto out; -- cgit v1.2.3-58-ga151 From 56889787cfa77dfd96f0b3a3e6a4f26c2e4a5134 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Sep 2011 18:22:38 -0400 Subject: ext4: improve handling of conflicting mount options If the user explicitly specifies conflicting mount options for delalloc or dioread_nolock and data=journal, fail the mount, instead of printing a warning and continuing (since many users won't look at dmesg and notice the warning). Also, print a single warning that data=journal implies that delayed allocation is not on by default (since it's not supported), and furthermore that O_DIRECT is not supported. Improve the text in Documentation/filesystems/ext4.txt so this is clear there as well. Similarly, if the dioread_nolock mount option is specified when the file system block size != PAGE_SIZE, fail the mount instead of printing a warning message and ignoring the mount option. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 8 +++--- fs/ext4/ext4.h | 3 +++ fs/ext4/super.c | 50 ++++++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 232a575a0c48..168242b5c045 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -160,7 +160,9 @@ noload if the filesystem was not unmounted cleanly, lead to any number of problems. data=journal All data are committed into the journal prior to being - written into the main file system. + written into the main file system. Enabling + this mode will disable delayed allocation and + O_DIRECT support. data=ordered (*) All data are forced directly out to the main file system prior to its metadata being committed to the @@ -419,8 +421,8 @@ written to the journal first, and then to its final location. In the event of a crash, the journal can be replayed, bringing both data and metadata into a consistent state. This mode is the slowest except when data needs to be read from and written to disk at the same time where it -outperforms all others modes. Currently ext4 does not have delayed -allocation support if this data journalling mode is selected. +outperforms all others modes. Enabling this mode will disable delayed +allocation and O_DIRECT support. 
/proc entries ============= diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ccfa81f33bb0..48ae98819d35 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -922,6 +922,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 44d0c8db2239..ee2f74a7084d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1801,6 +1801,7 @@ set_qf_format: break; case Opt_nodelalloc: clear_opt(sb, DELALLOC); + clear_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_mblk_io_submit: set_opt(sb, MBLK_IO_SUBMIT); @@ -1817,6 +1818,7 @@ set_qf_format: break; case Opt_delalloc: set_opt(sb, DELALLOC); + set_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_block_validity: set_opt(sb, BLOCK_VALIDITY); @@ -3224,6 +3226,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &journal_ioprio, NULL, 0)) goto failed_mount; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + goto failed_mount; + } + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3265,8 +3294,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -3679,25 +3706,6 @@ no_journal: "available"); } - if (test_opt(sb, DELALLOC) && - (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " - "requested data journaling mode"); - clear_opt(sb, DELALLOC); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - requested data journaling mode"); - clear_opt(sb, DIOREAD_NOLOCK); - } - if (sb->s_blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - block size is too small"); - clear_opt(sb, DIOREAD_NOLOCK); - } - } - err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " -- cgit v1.2.3-58-ga151 From f4dee24cca98739a4190a00fa014cd1b7e2581a4 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 2 Sep 2011 16:36:49 -0400 Subject: nfsd4: move CLOSE_STATE special case to caller Move the CLOSE_STATE case into the unique caller that cares about it rather than putting it in preprocess_seqid_op. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 56 ++++++++++++++++++++++++++--------------------------- fs/nfsd/state.h | 1 - 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9c44630a245a..eb11626babc6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3070,15 +3070,13 @@ laundromat_main(struct work_struct *not_used) } static struct nfs4_stateowner * -search_close_lru(u32 st_id, int flags) +search_close_lru(u32 st_id) { - struct nfs4_stateowner *local = NULL; + struct nfs4_stateowner *local; - if (flags & CLOSE_STATE) { - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) - return local; - } + list_for_each_entry(local, &close_lru, so_close_lru) { + if (local->so_id == st_id) + return local; } return NULL; } @@ -3381,7 +3379,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateid **stpp) { - struct nfs4_stateid *stp; struct nfs4_stateowner *sop; struct svc_fh *current_fh = &cstate->current_fh; __be32 status; @@ -3404,28 +3401,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * the confirmed flag is incorrecly set, or the generation * number is incorrect. */ - stp = find_stateid(stateid, flags); - if (stp == NULL) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - sop = search_close_lru(stateid->si_stateownerid, flags); - /* It's not stale; let's assume it's expired: */ - if (sop == NULL) - return nfserr_expired; - cstate->replay_owner = sop; - status = nfsd4_check_seqid(cstate, sop, seqid); - if (status) - return status; - return nfserr_bad_seqid; - } + *stpp = find_stateid(stateid, flags); + if (*stpp == NULL) + return nfserr_expired; - *stpp = stp; - sop = stp->st_stateowner; + sop = (*stpp)->st_stateowner; cstate->replay_owner = sop; - if (nfs4_check_fh(current_fh, stp)) { + if (nfs4_check_fh(current_fh, *stpp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; } @@ -3439,7 +3422,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &(*stpp)->st_stateid, nfsd4_has_session(cstate)); if (status) return status; renew_client(sop->so_client); @@ -3574,7 +3557,22 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check close_lru for replay */ status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, &stp); + OPEN_STATE, &stp); + if (stp == NULL && status == nfserr_expired) { + /* + * Also, we should make sure this isn't just the result of + * a replayed close: + */ + so = search_close_lru(close->cl_stateid.si_stateownerid); + /* It's not stale; let's assume it's expired: */ + if (so == NULL) + goto out; + cstate->replay_owner = so; + status = nfsd4_check_seqid(cstate, so, close->cl_seqid); + if (status) + goto out; + status = nfserr_bad_seqid; + } if (status) goto out; so = stp->st_stateowner; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index f7114fc21dee..0d88000d15d7 
100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -448,7 +448,6 @@ struct nfs4_stateid { #define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -#define CLOSE_STATE 0x00000040 struct nfsd4_compound_state; -- cgit v1.2.3-58-ga151 From 9ea7a0df63630ad8197716cd313ea66e28906fc0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 4 Sep 2011 10:18:14 -0400 Subject: jbd2: add debugging information to jbd2_journal_dirty_metadata() Add debugging information in case jbd2_journal_dirty_metadata() is called with a buffer_head which didn't have jbd2_journal_get_write_access() called on it, or if the journal_head has the wrong transaction in it. In addition, return an error code. This won't change anything for ocfs2, which will BUG_ON() the non-zero exit code. For ext4, the caller of this function is ext4_handle_dirty_metadata(), and on seeing a non-zero return code, will call __ext4_journal_stop(), which will print the function and line number of the (buggy) calling function and abort the journal. This will allow us to recover instead of bug halting, which is better from a robustness and reliability point of view. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 8 ++++--- fs/ext4/extents.c | 10 ++++++--- fs/jbd2/transaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index f5240aa15601..aca179017582 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (err) { + /* Errors can only happen if there is a bug */ + handle->h_err = err; + __ext4_journal_stop(where, line, handle); + } } else { if (inode) mark_buffer_dirty_inode(bh, inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9124cd24e093..2c5216a8d03b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -96,13 +96,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; if (path->p_bh) { /* path points to block */ - err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d7109414cdd..cb56fe9aaabb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1049,6 +1049,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, * mark dirty metadata which needs to be journaled as part of the current * transaction. * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. 
* @@ -1065,11 +1069,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; struct journal_head *jh = bh2jh(bh); + int ret = 0; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; + if (!buffer_jbd(bh)) { + ret = -EUCLEAN; + goto out; + } jbd_lock_bh_state(bh); @@ -1093,8 +1102,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } goto out_unlock_bh; } @@ -1108,9 +1129,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + if (unlikely(jh->b_transaction != + journal->j_committing_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_committing_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_committing_transaction, + journal->j_committing_transaction ? + journal->j_committing_transaction->t_tid : 0); + ret = -EINVAL; + } + if (unlikely(jh->b_next_transaction != transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_next_transaction (%llu, %p, %u) != " + "transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + transaction, transaction->t_tid); + ret = -EINVAL; + } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. 
*/ goto out_unlock_bh; @@ -1127,7 +1171,9 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - return 0; + if (ret) + __WARN(); /* All errors are bugs, so dump the stack */ + return ret; } /* -- cgit v1.2.3-58-ga151 From d2159fb7b8bac12684aabdf41d84b56da9f5c062 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 4 Sep 2011 10:20:14 -0400 Subject: jbd2: use gfp_t instead of int This silences some Sparse warnings: fs/jbd2/transaction.c:135:69: warning: incorrect type in argument 2 (different base types) fs/jbd2/transaction.c:135:69: expected restricted gfp_t [usertype] flags fs/jbd2/transaction.c:135:69: got int [signed] gfp_mask Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 6 +++--- include/linux/jbd2.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cb56fe9aaabb..b01fd6104089 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -115,7 +115,7 @@ static inline void update_t_max_wait(transaction_t *transaction, */ static int start_this_handle(journal_t *journal, handle_t *handle, - int gfp_mask) + gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; tid_t tid; @@ -320,7 +320,7 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -443,7 +443,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 38f307b8c334..3dd101e49d36 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1106,9 +1106,9 @@ static inline handle_t *journal_current_handle(void) */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); -extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask); +extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask); extern int jbd2_journal_restart(handle_t *, int nblocks); -extern int jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask); +extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); -- cgit v1.2.3-58-ga151 From decbd919f4bb9cb698486880c026c4104b13d3c3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Sep 2011 02:37:06 -0400 Subject: ext4: only call ext4_jbd2_file_inode when an inode has been extended In delayed allocation mode, it's important to only call ext4_jbd2_file_inode when the file has been extended. This is necessary to avoid a race which first got introduced in commit 678aaf481, but which was made much more common with the introduction of the "punch hole" functionality. (Especially when dioread_nolock was enabled; when I could reliably reproduce this problem with xfstests #74.) 
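As a rough standalone model (the stub names below are illustrative, not the real ext4/jbd2 symbols), the rule this patch enforces boils down to a single predicate: file the inode for ordered-data writeout only when the operation allocated blocks whose stale contents could otherwise become visible, and never for truncate or punch hole:

#include <stdbool.h>
#include <stdio.h>

struct inode { long i_size; };

/* Stand-in for ext4_jbd2_file_inode(): in the kernel this puts the
 * inode on the committing transaction's t_inode_list. */
static int file_inode_for_ordered_data(struct inode *inode)
{
	printf("filed on t_inode_list (i_size=%ld)\n", inode->i_size);
	return 0;
}

/* Only order the data when new blocks were actually allocated. */
static int maybe_file_inode(struct inode *inode, bool allocated_blocks,
			    bool ordered_mode)
{
	if (ordered_mode && allocated_blocks)
		return file_inode_for_ordered_data(inode);
	return 0;	/* truncate/punch hole: no stale data to hide */
}

int main(void)
{
	struct inode ino = { .i_size = 8192 };

	maybe_file_inode(&ino, true, true);	/* extending write: filed */
	maybe_file_inode(&ino, false, true);	/* punch hole: skipped */
	return 0;
}

Keeping the inode off the list in the second case is what breaks the deadlock described next.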
The race is this: If while trying to writeback a delayed allocation inode, there is a need to map delalloc blocks, and we run out of space in the journal, *and* at the same time the inode is already on the committing transaction's t_inode_list (because for example while doing the punch hole operation, ext4_jbd2_file_inode() is called), then the commit operation will wait for the inode to finish all of its pending writebacks by calling filemap_fdatawait(), but since that inode has one or more pages with the PageWriteback flag set, the commit operation will wait forever, and so the writeback of the inode can never take place, and the kjournald thread and the writeback thread end up waiting for each other --- forever. It's important at this point to recall why an inode is placed on the t_inode_list; it is to provide the data=ordered guarantees that we don't end up exposing stale data. In the case where we are truncating or punching a hole in the inode, there is no possibility that stale data could be exposed in the first place, so we don't need to put the inode on the t_inode_list! The right long-term fix is to get rid of data=ordered mode altogether, and only update the extent tree or indirect blocks after the data has been written. Until then, this change will also avoid some unnecessary waiting in the commit operation. Signed-off-by: "Theodore Ts'o" Cc: Allison Henderson Cc: Jan Kara --- fs/ext4/inode.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1b1ef71e5b7..f86b149fb8b1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1471,13 +1471,13 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - } - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) - /* This only happens if the journal is aborted */ - return; + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + /* Only if the journal is aborted */ + return; + } } /* @@ -3173,12 +3173,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); - } else { - if (ext4_should_order_data(inode) && - EXT4_I(inode)->jinode) - err = ext4_jbd2_file_inode(handle, inode); + } else mark_buffer_dirty(bh); - } BUFFER_TRACE(bh, "Partial buffer zeroed"); next: @@ -3301,11 +3297,8 @@ int ext4_block_zero_page_range(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); - } else { - if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) - err = ext4_jbd2_file_inode(handle, inode); + } else mark_buffer_dirty(bh); unlock: unlock_page(page); -- cgit v1.2.3-58-ga151 From 189e868fa8fdca702eb9db9d8afc46b5cb9144c9 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Tue, 6 Sep 2011 21:49:44 -0400 Subject: ext4: fix fsx truncate failure While running extended fsx tests to verify the first two patches, a similar bug was also found in the truncate operation. This bug happens because the truncate routine only zeros the non-block-aligned portion of the last page. This means that the block aligned portions of the page appearing after i_size are left unzeroed, and the buffer heads still mapped. 
This bug is corrected by using ext4_discard_partial_page_buffers in the truncate routine to zero the partial page and unmap the buffer heads. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++++++-- fs/ext4/indirect.c | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2c5216a8d03b..ba7bd5a176ce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3653,6 +3653,7 @@ void ext4_ext_truncate(struct inode *inode) struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; handle_t *handle; + loff_t page_len; int err = 0; /* @@ -3669,8 +3670,16 @@ void ext4_ext_truncate(struct inode *inode) if (IS_ERR(handle)) return; - if (inode->i_size & (sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } if (ext4_orphan_add(handle, inode)) goto out_stop; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 0962642119c0..ea704dff8669 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1343,7 +1343,9 @@ void ext4_ind_truncate(struct inode *inode) __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; + loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; + int err; handle = start_transaction(inode); if (IS_ERR(handle)) @@ -1354,9 +1356,16 @@ void ext4_ind_truncate(struct inode *inode) max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) goto out_stop; + } if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); -- cgit v1.2.3-58-ga151 From 02fac1297eb3f471a27368271aadd285548297b0 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Tue, 6 Sep 2011 21:53:01 -0400 Subject: ext4: fix partial page writes While running extended fsx tests to verify the preceding patches, a similar bug was also found in the write operation. Whenever a write operation begins or ends in a hole, or extends EOF, the partial page contained in the hole or beyond EOF needs to be zeroed out. To correct this, the new ext4_discard_partial_page_buffers_no_lock routine is used to zero out the partial page, but only for buffer heads that are already unmapped. 
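The byte ranges passed to ext4_discard_partial_page_buffers_no_lock() here are plain mask arithmetic; this self-contained sketch (the 4 KiB page size and the sample pos/copied values are assumptions for illustration, not taken from the patch) prints the head and tail spans that get zeroed around a write:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL	/* assumed page size */

int main(void)
{
	unsigned long pos = 5000, copied = 3000;	/* sample write */

	/* write_begin: partial page preceding the start of the write */
	unsigned long head_len = pos & (PAGE_CACHE_SIZE - 1);

	/* write_end: partial page following the last byte written */
	unsigned long tail_len = PAGE_CACHE_SIZE -
		((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));

	printf("head: zero %lu bytes from offset %lu\n",
	       head_len, pos - head_len);
	printf("tail: zero %lu bytes from offset %lu\n",
	       tail_len, pos + copied - 1);
	return 0;
}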
Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f86b149fb8b1..6ecc93979e48 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2255,6 +2255,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; handle_t *handle; + loff_t page_len; index = pos >> PAGE_CACHE_SHIFT; @@ -2301,6 +2302,13 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); + } else { + page_len = pos & (PAGE_CACHE_SIZE - 1); + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers_no_lock(handle, + inode, page, pos - page_len, page_len, + EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); + } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -2343,6 +2351,7 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; + loff_t page_len; if (write_mode == FALL_BACK_TO_NONDELALLOC) { if (ext4_should_order_data(inode)) { @@ -2391,6 +2400,16 @@ static int ext4_da_write_end(struct file *file, } ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + + page_len = PAGE_CACHE_SIZE - + ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers_no_lock(handle, + inode, page, pos + copied - 1, page_len, + EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); + } + copied = ret2; if (ret2 < 0) ret = ret2; -- cgit v1.2.3-58-ga151 From fe0750e5c43189adb6e6fc59837af7d5a588f413 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 30 Jul 2011 23:33:59 -0400 Subject: nfsd4: split stateowners into open and lockowners The stateowner has some fields that only make sense for openowners, and some that only make sense for lockowners, and I find it a lot clearer if those are separated out. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 18 +-- fs/nfsd/nfs4state.c | 367 ++++++++++++++++++++++++++-------------------------- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 33 ++++- fs/nfsd/xdr4.h | 2 +- 5 files changed, 224 insertions(+), 198 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ce151f0ed4b9..460eeb329d81 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -250,7 +250,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, @@ -277,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && @@ -306,9 +306,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_compoundres *resp; - dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, - open->op_stateowner); + open->op_openowner); /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) @@ -332,7 +332,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { - struct nfs4_replay *rp = &open->op_stateowner->so_replay; + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); fh_copy_shallow(&cstate->current_fh.fh_handle, &rp->rp_openfh); @@ -374,7 +374,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; /* * The CURRENT_FH is already set to the file being * opened. 
(1) set open->op_cinfo, (2) set @@ -387,7 +387,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -405,8 +405,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: - if (open->op_stateowner) - cstate->replay_owner = open->op_stateowner; + if (open->op_openowner) + cstate->replay_owner = &open->op_openowner->oo_owner; else nfs4_unlock_state(); return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eb11626babc6..567130dccda0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,7 +63,7 @@ static u64 current_sessionid = 1; static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); +static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -77,7 +77,8 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(recall_lock); -static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *openowner_slab = NULL; +static struct kmem_cache *lockowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; static struct kmem_cache *deleg_slab = NULL; @@ -432,41 +433,39 @@ static void release_lock_stateid(struct nfs4_stateid *stp) unhash_generic_stateid(stp); file = find_any_file(stp->st_file); if (file) - locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); free_generic_stateid(stp); } -static void unhash_lockowner(struct nfs4_stateowner *sop) +static void unhash_lockowner(struct nfs4_lockowner *lo) { struct nfs4_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, + list_del(&lo->lo_owner.so_idhash); + list_del(&lo->lo_owner.so_strhash); + list_del(&lo->lo_perstateid); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_stateid, st_perstateowner); release_lock_stateid(stp); } } -static void release_lockowner(struct nfs4_stateowner *sop) +static void release_lockowner(struct nfs4_lockowner *lo) { - unhash_lockowner(sop); - nfs4_free_stateowner(sop); + unhash_lockowner(lo); + nfs4_free_lockowner(lo); } static void release_stateid_lockowners(struct nfs4_stateid *open_stp) { - struct nfs4_stateowner *lock_sop; + struct nfs4_lockowner *lo; while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_lockowner(lock_sop); + lo = list_entry(open_stp->st_lockowners.next, + struct nfs4_lockowner, lo_perstateid); + release_lockowner(lo); } } @@ -477,26 +476,25 @@ static void release_open_stateid(struct nfs4_stateid *stp) free_generic_stateid(stp); } -static void unhash_openowner(struct nfs4_stateowner *sop) +static void 
unhash_openowner(struct nfs4_openowner *oo) { struct nfs4_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); /* XXX: necessary? */ - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, + list_del(&oo->oo_owner.so_idhash); + list_del(&oo->oo_owner.so_strhash); + list_del(&oo->oo_perclient); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, struct nfs4_stateid, st_perstateowner); release_open_stateid(stp); } } -static void release_openowner(struct nfs4_stateowner *sop) +static void release_openowner(struct nfs4_openowner *oo) { - unhash_openowner(sop); - list_del(&sop->so_close_lru); - nfs4_free_stateowner(sop); + unhash_openowner(oo); + list_del(&oo->oo_close_lru); + nfs4_free_openowner(oo); } #define SESSION_HASH_SIZE 512 @@ -961,7 +959,7 @@ unhash_client_locked(struct nfs4_client *clp) static void expire_client(struct nfs4_client *clp) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -979,8 +977,8 @@ expire_client(struct nfs4_client *clp) unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { - sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_openowner(sop); + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + release_openowner(oo); } nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -2173,7 +2171,8 @@ nfsd4_free_slab(struct kmem_cache **slab) void nfsd4_free_slabs(void) { - nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&openowner_slab); + nfsd4_free_slab(&lockowner_slab); nfsd4_free_slab(&file_slab); nfsd4_free_slab(&stateid_slab); nfsd4_free_slab(&deleg_slab); @@ -2182,9 +2181,13 @@ nfsd4_free_slabs(void) static int nfsd4_init_slabs(void) { - stateowner_slab = kmem_cache_create("nfsd4_stateowners", - sizeof(struct nfs4_stateowner), 0, 0, NULL); - if (stateowner_slab == NULL) + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out_nomem; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (lockowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); @@ -2205,11 +2208,16 @@ out_nomem: return -ENOMEM; } -void -nfs4_free_stateowner(struct nfs4_stateowner *sop) +void nfs4_free_openowner(struct nfs4_openowner *oo) +{ + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); +} + +void nfs4_free_lockowner(struct nfs4_lockowner *lo) { - kfree(sop->so_owner.data); - kmem_cache_free(stateowner_slab, sop); + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -2219,74 +2227,72 @@ static void init_nfs4_replay(struct nfs4_replay *rp) rp->rp_buf = rp->rp_ibuf; } -static inline struct nfs4_stateowner *alloc_stateowner(struct xdr_netobj *owner, struct nfs4_client *clp) +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) { struct nfs4_stateowner *sop; - sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL); + sop = kmem_cache_alloc(slab, GFP_KERNEL); if (!sop) return NULL; sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); if (!sop->so_owner.data) { - kmem_cache_free(stateowner_slab, 
sop); + kmem_cache_free(slab, sop); return NULL; } sop->so_owner.len = owner->len; - INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); sop->so_id = current_ownerid++; - sop->so_time = 0; sop->so_client = clp; init_nfs4_replay(&sop->so_replay); return sop; } -static void hash_openowner(struct nfs4_stateowner *sop, struct nfs4_client *clp, unsigned int strhashval) +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { unsigned int idhashval; - idhashval = open_ownerid_hashval(sop->so_id); - list_add(&sop->so_idhash, &open_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &open_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perclient, &clp->cl_openowners); + idhashval = open_ownerid_hashval(oo->oo_owner.so_id); + list_add(&oo->oo_owner.so_idhash, &open_ownerid_hashtbl[idhashval]); + list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); } -static struct nfs4_stateowner * +static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; - sop = alloc_stateowner(&open->op_owner, clp); - if (!sop) + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) return NULL; - sop->so_is_open_owner = 1; - sop->so_seqid = open->op_seqid; - sop->so_confirmed = 0; - hash_openowner(sop, clp, strhashval); - return sop; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_confirmed = 0; + oo->oo_time = 0; + INIT_LIST_HEAD(&oo->oo_close_lru); + hash_openowner(oo, clp, strhashval); + return oo; } static inline void init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop = open->op_stateowner; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + struct nfs4_openowner *oo = open->op_openowner; + unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_hash); INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); INIT_LIST_HEAD(&stp->st_perfile); list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); stp->st_type = NFS4_OPEN_STID; - stp->st_stateowner = sop; + stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_stateownerid = oo->oo_owner.so_id; stp->st_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; @@ -2299,12 +2305,12 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * } static void -move_to_close_lru(struct nfs4_stateowner *sop) +move_to_close_lru(struct nfs4_openowner *oo) { - dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); - list_move_tail(&sop->so_close_lru, &close_lru); - sop->so_time = get_seconds(); + list_move_tail(&oo->oo_close_lru, &close_lru); + oo->oo_time = get_seconds(); } static int @@ -2316,14 +2322,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, 
(sop->so_client->cl_clientid.cl_id == clid->cl_id); } -static struct nfs4_stateowner * +static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { struct nfs4_stateowner *so = NULL; list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return so; + return container_of(so, struct nfs4_openowner, oo_owner); } return NULL; } @@ -2474,7 +2480,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; - struct nfs4_stateowner *sop = NULL; + struct nfs4_openowner *oo = NULL; __be32 status; if (!check_name(open->op_owner)) @@ -2484,34 +2490,34 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_stale_clientid; strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); - sop = find_openstateowner_str(strhashval, open); - open->op_stateowner = sop; - if (!sop) { + oo = find_openstateowner_str(strhashval, open); + open->op_openowner = oo; + if (!oo) { /* Make sure the client's lease hasn't expired. */ clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; goto renew; } - if (!sop->so_confirmed) { + if (!oo->oo_confirmed) { /* Replace unconfirmed owners without checking for replay. */ - clp = sop->so_client; - release_openowner(sop); - open->op_stateowner = NULL; + clp = oo->oo_owner.so_client; + release_openowner(oo); + open->op_openowner = NULL; goto renew; } - status = nfsd4_check_seqid(cstate, sop, open->op_seqid); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; renew: - if (open->op_stateowner == NULL) { - sop = alloc_init_open_stateowner(strhashval, clp, open); - if (sop == NULL) + if (open->op_openowner == NULL) { + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) return nfserr_jukebox; - open->op_stateowner = sop; + open->op_openowner = oo; } - list_del_init(&sop->so_close_lru); - renew_client(sop->so_client); + list_del_init(&oo->oo_close_lru); + renew_client(oo->oo_owner.so_client); return nfs_ok; } @@ -2565,7 +2571,7 @@ out: return nfs_ok; if (status) return status; - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; return nfs_ok; } @@ -2573,14 +2579,14 @@ static __be32 nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) { struct nfs4_stateid *local; - struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; /* remember if we have seen this open owner */ - if (local->st_stateowner == sop) + if (local->st_stateowner == &oo->oo_owner) *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) @@ -2698,8 +2704,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_stateowner->so_confirmed = 1; - open->op_stateowner->so_client->cl_firststate = 1; + open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_owner.so_client->cl_firststate = 1; } /* Should we give out recallable state?: */ @@ -2782,11 +2788,11 @@ static void nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) { struct nfs4_delegation *dp; - struct 
nfs4_stateowner *sop = stp->st_stateowner; + struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; int status, flag = 0; - cb_up = nfsd4_cb_channel_good(sop->so_client); + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2802,7 +2808,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !sop->so_confirmed) + if (!cb_up || !oo->oo_confirmed) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2813,7 +2819,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta goto out; } - dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; status = nfs4_set_delegation(dp, flag); @@ -2901,7 +2907,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; /* * Attempt to hand out a delegation. No error return, because the @@ -2922,7 +2928,7 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed && + if (!open->op_openowner->oo_confirmed && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; @@ -2981,7 +2987,7 @@ static time_t nfs4_laundromat(void) { struct nfs4_client *clp; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nfsd4_lease; @@ -3038,16 +3044,16 @@ nfs4_laundromat(void) } test_val = nfsd4_lease; list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { - u = sop->so_time - cutoff; + oo = container_of(pos, struct nfs4_openowner, oo_close_lru); + if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + u = oo->oo_time - cutoff; if (test_val > u) test_val = u; break; } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - sop->so_id); - release_openowner(sop); + oo->oo_owner.so_id); + release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3069,13 +3075,12 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_stateowner * -search_close_lru(u32 st_id) +static struct nfs4_openowner * search_close_lru(u32 st_id) { - struct nfs4_stateowner *local; + struct nfs4_openowner *local; - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) + list_for_each_entry(local, &close_lru, oo_close_lru) { + if (local->oo_owner.so_id == st_id) return local; } return NULL; @@ -3209,7 +3214,8 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_bad_stateid; - if (!stp->st_stateowner->so_confirmed) + if (stp->st_stateowner->so_is_open_owner + && !openowner(stp->st_stateowner)->oo_confirmed) goto out; status = check_stateid_generation(stateid, &stp->st_stateid, has_session); @@ -3274,7 
+3280,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfserr_bad_stateid; if (nfs4_check_fh(current_fh, stp)) goto out; - if (!stp->st_stateowner->so_confirmed) + if (stp->st_stateowner->so_is_open_owner + && !openowner(stp->st_stateowner)->oo_confirmed) goto out; status = check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); @@ -3308,7 +3315,7 @@ nfsd4_free_delegation_stateid(stateid_t *stateid) static __be32 nfsd4_free_lock_stateid(struct nfs4_stateid *stp) { - if (check_for_locks(stp->st_file, stp->st_stateowner)) + if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) return nfserr_locks_held; release_lock_stateid(stp); return nfs_ok; @@ -3417,7 +3424,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (status) return status; - if (!sop->so_confirmed && !(flags & CONFIRM)) { + if (sop->so_is_open_owner && !openowner(sop)->oo_confirmed + && !(flags & CONFIRM)) { dprintk("NFSD: preprocess_seqid_op: stateowner not" " confirmed yet!\n"); return nfserr_bad_stateid; @@ -3434,7 +3442,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open_confirm *oc) { __be32 status; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", @@ -3452,17 +3460,17 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, CONFIRM | OPEN_STATE, &stp); if (status) goto out; - sop = stp->st_stateowner; + oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; - if (sop->so_confirmed) + if (oo->oo_confirmed) goto out; - sop->so_confirmed = 1; + oo->oo_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); - nfsd4_create_clid_dir(sop->so_client); + nfsd4_create_clid_dir(oo->oo_owner.so_client); status = nfs_ok; out: if (!cstate->replay_owner) @@ -3513,7 +3521,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, &od->od_stateid, OPEN_STATE, &stp); if (status) goto out; - status = nfserr_inval; if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", @@ -3546,8 +3553,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; + struct nfs4_openowner *oo; struct nfs4_stateid *stp; - struct nfs4_stateowner *so; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3563,19 +3570,19 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Also, we should make sure this isn't just the result of * a replayed close: */ - so = search_close_lru(close->cl_stateid.si_stateownerid); + oo = search_close_lru(close->cl_stateid.si_stateownerid); /* It's not stale; let's assume it's expired: */ - if (so == NULL) + if (oo == NULL) goto out; - cstate->replay_owner = so; - status = nfsd4_check_seqid(cstate, so, close->cl_seqid); + cstate->replay_owner = &oo->oo_owner; + status = nfsd4_check_seqid(cstate, &oo->oo_owner, close->cl_seqid); if (status) goto out; status = nfserr_bad_seqid; } if (status) goto out; - so = stp->st_stateowner; + oo = openowner(stp->st_stateowner); status = nfs_ok; update_stateid(&stp->st_stateid); memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3587,8 +3594,8 @@ 
nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&so->so_stateids)) - move_to_close_lru(so); + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); out: if (!cstate->replay_owner) nfs4_unlock_state(); @@ -3768,17 +3775,17 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - sop = (struct nfs4_stateowner *) fl->fl_owner; - deny->ld_owner.data = kmemdup(sop->so_owner.data, - sop->so_owner.len, GFP_KERNEL); + lo = (struct nfs4_lockowner *) fl->fl_owner; + deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, + lo->lo_owner.so_owner.len, GFP_KERNEL); if (!deny->ld_owner.data) /* We just don't care that much */ goto nevermind; - deny->ld_owner.len = sop->so_owner.len; - deny->ld_clientid = sop->so_client->cl_clientid; + deny->ld_owner.len = lo->lo_owner.so_owner.len; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { nevermind: deny->ld_owner.len = 0; @@ -3795,8 +3802,8 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } -static struct nfs4_stateowner * -find_lockstateowner_str(struct inode *inode, clientid_t *clid, +static struct nfs4_lockowner * +find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); @@ -3804,19 +3811,19 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(op, owner, clid)) - return op; + return lockowner(op); } return NULL; } -static void hash_lockowner(struct nfs4_stateowner *sop, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) { unsigned int idhashval; - idhashval = lockownerid_hashval(sop->so_id); - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); + idhashval = lockownerid_hashval(lo->lo_owner.so_id); + list_add(&lo->lo_owner.so_idhash, &lock_ownerid_hashtbl[idhashval]); + list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } /* @@ -3827,28 +3834,27 @@ static void hash_lockowner(struct nfs4_stateowner *sop, unsigned int strhashval, * strhashval = lock_ownerstr_hashval */ -static struct nfs4_stateowner * +static struct nfs4_lockowner * alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; - sop = alloc_stateowner(&lock->lk_new_owner, clp); - if (!sop) + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) return NULL; - INIT_LIST_HEAD(&sop->so_stateids); - sop->so_is_open_owner = 0; + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ - sop->so_seqid = lock->lk_new_lock_seqid + 1; - 
sop->so_confirmed = 1; - hash_lockowner(sop, strhashval, clp, open_stp); - return sop; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; + hash_lockowner(lo, strhashval, clp, open_stp); + return lo; } static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_stateid *open_stp) { struct nfs4_stateid *stp; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + unsigned int hashval = stateid_hashval(lo->lo_owner.so_id, fp->fi_id); stp = nfs4_alloc_stateid(); if (stp == NULL) @@ -3859,13 +3865,13 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &sop->so_stateids); - stp->st_stateowner = sop; + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + stp->st_stateowner = &lo->lo_owner; stp->st_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_stateownerid = lo->lo_owner.so_id; stp->st_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; @@ -3902,8 +3908,8 @@ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) { - struct nfs4_stateowner *open_sop = NULL; - struct nfs4_stateowner *lock_sop = NULL; + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; struct nfs4_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; @@ -3949,23 +3955,23 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, OPEN_STATE, &open_stp); if (status) goto out; + open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; - open_sop = open_stp->st_stateowner; if (!nfsd4_has_session(cstate) && - !same_clid(&open_sop->so_client->cl_clientid, + !same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->v.new.clientid)) goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->so_client->cl_clientid.cl_id, + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->oo_owner.so_client->cl_clientid.cl_id, &lock->v.new.owner); /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? 
*/ status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->so_client, open_stp, lock); + open_sop->oo_owner.so_client, open_stp, lock); if (lock_sop == NULL) goto out; lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); @@ -3974,12 +3980,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { /* lock (lock owner + lock stateid) already exists */ status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, LOCK_STATE, &lock_stp); if (status) goto out; - lock_sop = lock_stp->st_stateowner; + lock_sop = lockowner(lock_stp->st_stateowner); fp = lock_stp->st_file; } /* lock_sop and lock_stp have been created or found */ @@ -4092,7 +4098,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; - struct nfs4_stateowner *so; + struct nfs4_lockowner *lo; int error; __be32 status; @@ -4128,10 +4134,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - so = find_lockstateowner_str(inode, - &lockt->lt_clientid, &lockt->lt_owner); - if (so) - file_lock.fl_owner = (fl_owner_t)so; + lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); + if (lo) + file_lock.fl_owner = (fl_owner_t)lo; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; @@ -4186,7 +4191,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) stp->st_stateowner; + file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; @@ -4225,7 +4230,7 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; struct inode *inode = filp->fi_inode; @@ -4250,6 +4255,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; struct nfs4_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; @@ -4279,11 +4285,10 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_file, sop)) + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) goto out; - /* Note: so_perclient unused for lockowners, - * so it's OK to fool with here. */ - list_add(&sop->so_perclient, &matches); + list_add(&lo->lo_list, &matches); } } } @@ -4292,12 +4297,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * have been checked. */ status = nfs_ok; while (!list_empty(&matches)) { - sop = list_entry(matches.next, struct nfs4_stateowner, - so_perclient); + lo = list_entry(matches.next, struct nfs4_lockowner, + lo_list); /* unhash_stateowner deletes so_perclient only * for openowners. 
*/ - list_del(&sop->so_perclient); - release_lockowner(sop); + list_del(&lo->lo_list); + release_lockowner(lo); } out: nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c4dcba3aac1f..182570bed472 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -646,7 +646,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) memset(open->op_bmval, 0, sizeof(open->op_bmval)); open->op_iattr.ia_valid = 0; - open->op_stateowner = NULL; + open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(16 + sizeof(clientid_t)); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0d88000d15d7..7994ed9be3cc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -337,14 +337,11 @@ struct nfs4_replay { * reaped (when so_perfilestate is empty) to hold the last close replay. * reaped by laundramat thread after lease period. */ + struct nfs4_stateowner { struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ - struct list_head so_perclient; struct list_head so_stateids; - struct list_head so_perstateid; /* for lockowners only */ - struct list_head so_close_lru; /* tail queue */ - time_t so_time; /* time of placement on so_close_lru */ int so_is_open_owner; /* 1=openowner,0=lockowner */ u32 so_id; struct nfs4_client * so_client; @@ -352,10 +349,33 @@ struct nfs4_stateowner { * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ - int so_confirmed; /* successful OPEN_CONFIRM? */ struct nfs4_replay so_replay; }; +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + struct list_head oo_close_lru; /* tail queue */ + time_t oo_time; /* time of placement on so_close_lru */ + int oo_confirmed; /* successful OPEN_CONFIRM? */ +}; + +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_list; /* for temporary uses */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. 
* o fi_perfile list is used to search for conflicting @@ -457,7 +477,8 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct nfs4_stateowner *sop); +extern void nfs4_free_openowner(struct nfs4_openowner *); +extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 27a3dfab96a9..f95a72482064 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -227,7 +227,7 @@ struct nfsd4_open { struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ int op_truncate; /* used during processing */ - struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr -- cgit v1.2.3-58-ga151 From 4665e2bac5076d02264f4a4d79edafa05ec7b752 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 14:50:49 -0400 Subject: nfsd4: split out some free_generic_stateid code We'll use this elsewhere. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 567130dccda0..c28432a80210 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -411,7 +411,7 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp) list_del(&stp->st_perstateowner); } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_stateid *stp) { int i; @@ -420,9 +420,16 @@ static void free_generic_stateid(struct nfs4_stateid *stp) if (test_bit(i, &stp->st_access_bmap)) nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); } } put_nfs4_file(stp->st_file); + stp->st_file = NULL; +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + close_generic_stateid(stp); kmem_cache_free(stateid_slab, stp); } -- cgit v1.2.3-58-ga151 From a25cac5198d4ff2842ccca63b423962848ad24b2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:40:25 +0200 Subject: proc: Consider NO_HZ when printing idle and iowait times The show_stat handler of the /proc/stat file relies on kstat_cpu(cpu) statistics when printing information about idle and iowait times. This is OK if we are not using a tickless kernel (CONFIG_NO_HZ) because the counters are updated periodically. With NO_HZ things get more tricky because we are not doing idle/iowait accounting while we are tickless, so the values might get outdated. Users of /proc/stat will notice this as unchanged idle/iowait values, which are then interpreted as 0% idle/iowait time. From the user-space POV this is unexpected behavior and a change of the interface. Let's fix this by using get_cpu_{idle,iowait}_time_us, which accounts the total idle/iowait time since boot and doesn't rely on sampling or any other periodic activity. Fall back to the previous behavior if NO_HZ is disabled or not configured.
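For reference, idle and iowait are the fourth and fifth fields of the cpu lines that this commit fixes. A minimal userspace reader of the aggregate line (a sketch, with most error handling elided) shows what monitoring tools actually sample:

#include <stdio.h>

int main(void)
{
        unsigned long long user, nice, system, idle, iowait;
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
                return 1;
        /* The first line aggregates all CPUs:
         * "cpu user nice system idle iowait irq softirq ..." (USER_HZ ticks).
         * Before this fix, on a tickless kernel, idle/iowait could stay
         * unchanged between samples and be misread as 0% idle time. */
        if (fscanf(f, "cpu %llu %llu %llu %llu %llu",
                   &user, &nice, &system, &idle, &iowait) == 5)
                printf("idle=%llu iowait=%llu\n", idle, iowait);
        fclose(f);
        return 0;
}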
Signed-off-by: Michal Hocko Cc: Dave Jones Cc: Arnd Bergmann Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/39181366adac1b39cb6aa3cd53ff0f7c78d32676.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner --- fs/proc/stat.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9758b654a1bc..42b274da92c3 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -21,6 +22,35 @@ #define arch_idle_time(cpu) 0 #endif +static cputime64_t get_idle_time(int cpu) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); + cputime64_t idle; + + if (idle_time == -1ULL) { + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); + } else + idle = usecs_to_cputime(idle_time); + + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); + cputime64_t iowait; + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kstat_cpu(cpu).cpustat.iowait; + else + iowait = usecs_to_cputime(iowait_time); + + return iowait; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v) user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + idle = cputime64_add(idle, get_idle_time(i)); + iowait = cputime64_add(iowait, get_iowait_time(i)); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); @@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest), (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kstat_cpu(i).cpustat.user; nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; + idle = get_idle_time(i); + iowait = get_iowait_time(i); irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; -- cgit v1.2.3-58-ga151 From 7c2e70879fc0949b4220ee61b7c4553f6976a94d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:28:51 -0400 Subject: ext4: add ext4-specific kludge to avoid an oops after the disk disappears The del_gendisk() function uninitializes the disk-specific data structures, including the bdi structure, without telling anyone else. Once this happens, any attempt to call mark_buffer_dirty() (for example, by ext4_commit_super), will cause a kernel OOPS. Fix this for now until we can fix things in an architecturally correct way. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee2f74a7084d..5dcd0dacc591 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -414,6 +414,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + + return bdi->dev == NULL; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -4072,7 +4088,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* -- cgit v1.2.3-58-ga151 From 281b59959707dfae03ce038cdf231bf4904e170c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:34:51 -0400 Subject: ext4: read-only support for bigalloc file systems This adds supports for bigalloc file systems. It teaches the mount code just enough about bigalloc superblock fields that it will mount the file system without freaking out that the number of blocks per group is too big. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 17 ++++++++++++++--- fs/ext4/super.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 64 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 48ae98819d35..f7257aa6bf81 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -239,8 +239,11 @@ struct ext4_io_submit { # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif @@ -306,6 +309,7 @@ struct flex_groups { #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) @@ -975,9 +979,9 @@ struct ext4_super_block { /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ - __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ @@ -1073,7 +1077,10 @@ struct ext4_super_block { __u8 s_last_error_func[32]; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; - __le32 s_reserved[112]; /* Padding to the end of the block */ + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_reserved[109]; /* Padding to the end of the block */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1093,6 +1100,7 @@ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ @@ -1101,6 +1109,8 @@ struct ext4_sb_info { ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct 
ext4_super_block *s_es; /* Pointer to the super block in the buffer */ @@ -1367,6 +1377,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5dcd0dacc591..823e7d9deee2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1953,7 +1953,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, res = MS_RDONLY; } if (read_only) - return res; + goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); @@ -1984,6 +1984,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); +done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", @@ -3105,10 +3106,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *cp; const char *descr; int ret = -ENOMEM; - int blocksize; + int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files; + int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -3412,12 +3413,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_dirt = 1; } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / blocksize; + if (sbi->s_inodes_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, "#inodes per group too big: %lu", -- cgit v1.2.3-58-ga151 From bab08ab9646288f1b0b72a7aaeecdff94bd62c18 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: 
Fri, 9 Sep 2011 18:36:51 -0400 Subject: ext4: enforce bigalloc restrictions (e.g., no online resizing, etc.) At least initially, if the bigalloc feature is enabled, we will not support non-extent mapped inodes, online resizing, online defrag, or the FITRIM ioctl. This simplifies the initial implementation. Signed-off-by: "Theodore Ts'o" --- fs/ext4/indirect.c | 7 +++++++ fs/ext4/inode.c | 5 +++++ fs/ext4/ioctl.c | 33 +++++++++++++++++++++++++++++---- fs/ext4/super.c | 7 +++++++ 4 files changed, 48 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index ea704dff8669..3cfc73fbca8e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6ecc93979e48..904a9a623dab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3358,6 +3358,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return -ENOTSUPP; } + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + /* TODO: Add support for bigalloc file systems */ + return -ENOTSUPP; + } + return ext4_ext_punch_hole(file, offset, length); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f18bfe37aff8..2046d699b4a7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -21,6 +21,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; @@ -183,7 +184,6 @@ setversion_out: * Returns 1 if it slept, else zero.
*/ { - struct super_block *sb = inode->i_sb; DECLARE_WAITQUEUE(wait, current); int ret = 0; @@ -199,7 +199,6 @@ setversion_out: #endif case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err, err2=0; err = ext4_resize_begin(sb); @@ -209,6 +208,13 @@ setversion_out: if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -250,6 +256,13 @@ setversion_out: goto mext_out; } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) goto mext_out; @@ -270,7 +283,6 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - struct super_block *sb = inode->i_sb; int err, err2=0; err = ext4_resize_begin(sb); @@ -281,6 +293,13 @@ mext_out: sizeof(input))) return -EFAULT; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -337,7 +356,6 @@ mext_out: case FITRIM: { - struct super_block *sb = inode->i_sb; struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; @@ -348,6 +366,13 @@ mext_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "FITRIM not supported with bigalloc"); + return -EOPNOTSUPP; + } + if (copy_from_user(&range, (struct fstrim_range *)arg, sizeof(range))) return -EFAULT; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 823e7d9deee2..25a4bfe3f39f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2701,6 +2701,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } return 1; } -- cgit v1.2.3-58-ga151 From 7137d7a48e2213eb1f6d6529da14c2ed3706b795 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:38:51 -0400 Subject: ext4: convert instances of EXT4_BLOCKS_PER_GROUP to EXT4_CLUSTERS_PER_GROUP Change the places in fs/ext4/mballoc.c where EXT4_BLOCKS_PER_GROUP is used to indicate the number of bits in a block bitmap (which is really a cluster allocation bitmap in bigalloc file systems). There are still some places in the ext4 codebase where usage of EXT4_BLOCKS_PER_GROUP needs to be audited/fixed, in code paths that aren't used given the initial restricted assumptions for bigalloc. These will need to be fixed before we can relax those restrictions.
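The unit change matters because under bigalloc each bit in the on-disk "block" bitmap covers a whole cluster, so bitmap indices must be cluster-relative. A standalone C sketch of the mapping, with cluster_bits and the block numbers as assumed example values:

#include <stdio.h>

int main(void)
{
        unsigned int cluster_bits = 4;                  /* example: 16 blocks per cluster */
        unsigned long long group_first_block = 32768;   /* example group start */
        unsigned long long block = group_first_block + 1000;

        /* The bitmap bit for this block is its cluster offset within the
         * group; the number of valid bits per bitmap is therefore
         * EXT4_CLUSTERS_PER_GROUP(sb), not EXT4_BLOCKS_PER_GROUP(sb). */
        unsigned long long bit = (block - group_first_block) >> cluster_bits;

        printf("block %llu -> bitmap bit %llu\n", block, bit);
        return 0;
}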
Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 17a5a57c415a..81e28657a3c2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -653,7 +653,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -705,7 +705,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; @@ -1624,8 +1624,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1823,8 +1823,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we @@ -1887,7 +1887,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { @@ -3036,7 +3036,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -3690,7 +3690,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); repeat: @@ -4007,8 +4007,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) + len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; /* start searching from the goal */ goal = ar->goal; @@ -4552,8 +4552,8 @@ do_more: * Check to see if we are freeing blocks across a group * boundary. 
*/ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (bit + count > EXT4_CLUSTERS_PER_GROUP(sb)) { + overflow = bit + count - EXT4_CLUSTERS_PER_GROUP(sb); count -= overflow; } bitmap_bh = ext4_read_block_bitmap(sb, block_group); @@ -4948,7 +4948,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) struct ext4_group_info *grp; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); - ext4_grpblk_t cnt = 0, first_block, last_block; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, len, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); @@ -4958,7 +4958,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) return -EINVAL; if (start + len <= first_data_blk) goto out; @@ -4969,11 +4969,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, - &first_group, &first_block); + &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), - &last_group, &last_block); + &last_group, &last_cluster); last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_block = EXT4_BLOCKS_PER_GROUP(sb); + last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); if (first_group > last_group) return -EINVAL; @@ -4993,20 +4993,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group in which case start + * len < EXT4_BLOCKS_PER_GROUP(sb). */ - if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) - last_block = first_block + len; - len -= last_block - first_block; + if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) + last_cluster = first_cluster + len; + len -= last_cluster - first_cluster; if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_block, - last_block, minlen); + cnt = ext4_trim_all_free(sb, group, first_cluster, + last_cluster, minlen); if (cnt < 0) { ret = cnt; break; } } trimmed += cnt; - first_block = 0; + first_cluster = 0; } range->len = trimmed * sb->s_blocksize; -- cgit v1.2.3-58-ga151 From 49f7f9af4bb4d7972f3a35a74877937fec9f622d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:40:51 -0400 Subject: ext4: factor out block group accounting into functions This makes it easier to understand how ext4_init_block_bitmap() works, and it will assist when we split out ext4_free_blocks_after_init() in the next commit. 
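As a sketch of what the refactoring preserves (using the helper names introduced in the diff below), the free-block count computed for a group can be written as:

	free_blocks = num_blocks_in_group(sb, block_group)
		      - num_base_meta_blocks(sb, block_group)
		      - ext4_group_used_meta_blocks(sb, block_group, gdp);

Naming the three terms separately is what allows each of them to be converted to cluster units independently later in the series.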
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 80 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f8224adf496e..8573e2bfb78a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,9 @@ #include +static unsigned int num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -83,14 +86,30 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, return used_blocks; } +static unsigned int num_blocks_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + if (block_group == ext4_get_groups_count(sb) - 1) { + /* + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. + */ + return ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + return EXT4_BLOCKS_PER_GROUP(sb); +} + /* Initializes an uninitialized block bitmap if given, and returns the * number of blocks free in the group. */ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { - int bit, bit_max; + unsigned int bit, bit_max = num_base_meta_blocks(sb, block_group); ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned free_blocks, group_blocks; + unsigned group_blocks = num_blocks_in_group(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (bh) { @@ -110,35 +129,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, memset(bh->b_data, 0, sb->s_blocksize); } - /* Check for superblock and gdt backups in this group */ - bit_max = ext4_bg_has_super(sb, block_group); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (bit_max) { - bit_max += ext4_bg_num_gdb(sb, block_group); - bit_max += - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); - } - } else { /* For META_BG_BLOCK_GROUPS */ - bit_max += ext4_bg_num_gdb(sb, block_group); - } - - if (block_group == ngroups - 1) { - /* - * Even though mke2fs always initialize first and last group - * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need - * to make sure we calculate the right free blocks - */ - group_blocks = ext4_blocks_count(sbi->s_es) - - ext4_group_first_block_no(sb, ngroups - 1); - } else { - group_blocks = EXT4_BLOCKS_PER_GROUP(sb); - } - - free_blocks = group_blocks - bit_max; - if (bh) { ext4_fsblk_t start, tmp; int flex_bg = 0; @@ -176,7 +166,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); + return group_blocks - bit_max - + ext4_group_used_meta_blocks(sb, block_group, gdp); } @@ -620,6 +611,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. 
+ */ +static unsigned int num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb(sb, block_group); + } + return num; +} /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation -- cgit v1.2.3-58-ga151 From fd034a84e1ea5c8c8d159cd2089c32e792c269b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:42:51 -0400 Subject: ext4: split out ext4_free_blocks_after_init() The function ext4_free_blocks_after_init() used to be a #define of ext4_init_block_bitmap(). This actually made it difficult to understand how the function worked, and made it hard to make changes to support clusters. So as an initial cleanup, I've separated out the functionality of initializing the block bitmap from calculating the number of free blocks in the new block group. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 105 +++++++++++++++++++++++++++---------------------------- fs/ext4/ext4.h | 13 +++---- fs/ext4/ialloc.c | 20 ++++------- 3 files changed, 66 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8573e2bfb78a..735d9fcc72e6 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -102,74 +102,73 @@ static unsigned int num_blocks_in_group(struct super_block *sb, return EXT4_BLOCKS_PER_GROUP(sb); } -/* Initializes an uninitialized block bitmap if given, and returns the - * number of blocks free in the group. */ -unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, struct ext4_group_desc *gdp) +/* Initializes an uninitialized block bitmap */ +void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned int bit, bit_max = num_base_meta_blocks(sb, block_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned group_blocks = num_blocks_in_group(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (bh) { - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", - block_group); - ext4_free_blks_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; - } - memset(bh->b_data, 0, sb->s_blocksize); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag.
*/ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return; } + memset(bh->b_data, 0, sb->s_blocksize); - if (bh) { - ext4_fsblk_t start, tmp; - int flex_bg = 0; + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); - for (bit = 0; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + start = ext4_group_first_block_no(sb, block_group); - start = ext4_group_first_block_no(sb, block_group); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG)) - flex_bg = 1; + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); - /* Set bits for block and inode bitmaps, and inode table */ - tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); - tmp = ext4_inode_bitmap(sb, gdp); + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(tmp - start, bh->b_data); - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!flex_bg || - ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); - } - /* - * Also if the number of blocks within the group is - * less than the blocksize * 8 ( which is the size - * of bitmap ), set rest of the block bitmap to 1 - */ - ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, - bh->b_data); } - return group_blocks - bit_max - - ext4_group_used_meta_blocks(sb, block_group, gdp); + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_blocks_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); } +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_blocks_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_blocks_in_group(sb, block_group) - + num_base_meta_blocks(sb, block_group) - + ext4_group_used_meta_blocks(sb, block_group, gdp); +} /* * The free blocks are managed by bitmaps. 
A file system contains several diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f7257aa6bf81..b0b7b67e439d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1763,12 +1763,13 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) +extern void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern unsigned ext4_free_blocks_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9c63f273b550..b7a8130d0af4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,7 +816,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - int free = 0; static int once = 1; ext4_group_t flex_group; @@ -950,26 +949,21 @@ got: goto fail; } - free = 0; - ext4_lock_group(sb, group); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, free); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } ext4_unlock_group(sb, group); - /* Don't need to dirty bitmap block if we didn't change it */ - if (free) { - BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); - err = ext4_handle_dirty_metadata(handle, - NULL, block_bitmap_bh); - } - - brelse(block_bitmap_bh); if (err) goto fail; } -- cgit v1.2.3-58-ga151 From d5b8f31007a93777cfb0603b665858fb7aebebfc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:44:51 -0400 Subject: ext4: bigalloc changes to block bitmap initialization functions Add bigalloc support to ext4_init_block_bitmap() and ext4_free_blocks_after_init(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 131 ++++++++++++++++++++++++++++++++++++++----------------- fs/ext4/ext4.h | 13 ++++++ 2 files changed, 103 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 735d9fcc72e6..1c6d777b35a2 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,9 +23,6 @@ #include -static unsigned int num_base_meta_blocks(struct super_block *sb, - ext4_group_t block_group); - /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -58,37 +55,87 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, return 0; } -static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. 
+ */ +unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { - ext4_fsblk_t tmp; + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* block bitmap, inode bitmap, and inode table blocks */ - int used_blocks = sbi->s_itb_per_group + 2; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), - block_group)) - used_blocks--; - - if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), - block_group)) - used_blocks--; - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!ext4_block_in_group(sb, tmp, block_group)) - used_blocks -= 1; + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, (start - + ext4_block_bitmap(sb, gdp))); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; + } + } + + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + start - ext4_inode_bitmap(sb, gdp)); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; + } + } + + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, start - itbl_blk + i); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; } } - return used_blocks; + + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; } -static unsigned int num_blocks_in_group(struct super_block *sb, - ext4_group_t block_group) +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) { + unsigned int blocks; + if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and @@ -96,10 +143,11 @@ static unsigned int num_blocks_in_group(struct super_block *sb, * we need to make sure we calculate the right free * blocks. 
*/ - return ext4_blocks_count(EXT4_SB(sb)->s_es) - + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else - return EXT4_BLOCKS_PER_GROUP(sb); + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ @@ -107,7 +155,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned int bit, bit_max = num_base_meta_blocks(sb, block_group); + unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; @@ -126,6 +174,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, } memset(bh->b_data, 0, sb->s_blocksize); + bit_max = ext4_num_base_meta_clusters(sb, block_group); for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); @@ -137,24 +186,25 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } + /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ - ext4_mark_bitmap_end(num_blocks_in_group(sb, block_group), + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); } @@ -165,9 +215,8 @@ unsigned ext4_free_blocks_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - return num_blocks_in_group(sb, block_group) - - num_base_meta_blocks(sb, block_group) - - ext4_group_used_meta_blocks(sb, block_group, gdp); + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); } /* @@ -611,14 +660,14 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata blocks at + * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. 
*/ -static unsigned int num_base_meta_blocks(struct super_block *sb, - ext4_group_t block_group) +unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int num; + unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); @@ -633,7 +682,7 @@ static unsigned int num_base_meta_blocks(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb(sb, block_group); } - return num; + return EXT4_NUM_B2C(sbi, num); } /** * ext4_inode_to_goal_block - return a hint for block allocation diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0b7b67e439d..803cfa42e1e8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -261,6 +261,14 @@ struct ext4_io_submit { #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) + /* * Structure of a blocks group descriptor */ @@ -1770,6 +1778,11 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_blocks_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); +extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ -- cgit v1.2.3-58-ga151 From 3212a80a58062056bb922811071062be58d8fee1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:46:51 -0400 Subject: ext4: convert block group-relative offsets to use clusters Certain parts of the ext4 code base, primarily in mballoc.c, use a block group number and offset from the beginning of the block group. This offset is invariably used to index into the allocation bitmap, so change the offset to be denominated in units of clusters. 
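A worked example of the new offset arithmetic (illustrative values, assuming s_first_data_block == 0): with 32768 blocks per group and a cluster ratio of 16 (s_cluster_bits == 4), block 98310 decomposes as 98310 = 3 * 32768 + 6, so do_div() yields group 3 with a raw block offset of 6, and after this patch *offsetp becomes 6 >> 4 = 0, i.e. bit 0 of group 3's cluster bitmap. The ext4_grp_offs_to_block() change is the inverse mapping: fe_start is shifted left by s_cluster_bits before being added to the group's first block.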
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 6 ++++-- fs/ext4/mballoc.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1c6d777b35a2..89abf1f7b253 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -28,7 +28,8 @@ */ /* - * Calculate the block group number and offset, given a block number + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) @@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9d4a636b546c..3cdb8aa9f6b7 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -216,6 +216,7 @@ struct ext4_buddy { static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } #endif -- cgit v1.2.3-58-ga151 From 53accfa9f819c80056db6f03f9c5cfa4bcba1ed8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:48:51 -0400 Subject: ext4: teach mballoc preallocation code about bigalloc clusters In most of mballoc.c, we do everything in units of clusters, since the block allocation bitmaps and buddy bitmaps are all denominated in clusters. The one place where we do deal with absolute block numbers is in the code that handles the preallocation regions, since in the case of inode-based preallocation regions, the start of the preallocation region can't be relative to the beginning of the group. So this adds a bit of complexity, where pa_pstart and pa_lstart are block numbers, while pa_free, pa_len, and fe_len are denominated in units of clusters. Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 95 +++++++++++++++++++++++++++++++------------------------ fs/ext4/mballoc.h | 4 +-- 2 files changed, 56 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 81e28657a3c2..8765f2512f13 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -70,8 +70,8 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> length for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc @@ -459,7 +459,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += first + i; + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, @@ -734,7 +734,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u blocks in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd", free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -1339,7 +1339,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += block; + blocknr += EXT4_C2B(EXT4_SB(sb), block); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -1831,7 +1831,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * we have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But bitmap says 0", free); break; @@ -1841,7 +1841,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -2752,7 +2752,7 @@ void ext4_exit_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_blks) + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; @@ -2791,7 +2791,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = ac->ac_b_ex.fe_len; + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); @@ -2838,7 +2838,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -2886,6 +2886,7 @@ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; @@ -2916,7 +2917,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -2988,7 +2989,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || @@ -3018,9 +3020,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); 
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -3043,7 +3047,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -3112,14 +3116,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3127,7 +3133,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; @@ -3193,6 +3199,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; @@ -3210,12 +3217,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3412,6 +3421,7 @@ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; @@ -3443,16 +3453,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3477,7 
+3489,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3592,7 +3604,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - bit; + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3607,7 +3619,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; @@ -3958,7 +3971,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -4019,18 +4032,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, /* set up allocation goals */ memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or @@ -4182,13 +4192,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); - pa->pa_pstart += ac->ac_b_ex.fe_len; - pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); @@ -4249,7 +4260,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; - unsigned int reserv_blks = 0; + unsigned int reserv_clstrs = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4279,12 +4290,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = -ENOSPC; return 0; } - reserv_blks = ar->len; + reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, ar->len); + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); } else { while (ar->len && - dquot_alloc_block(ar->inode, ar->len)) { + dquot_alloc_block(ar->inode, + 
EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; @@ -4328,7 +4341,7 @@ repeat: ext4_mb_new_preallocation(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp == -EAGAIN) { /* * drop the reference that we took @@ -4364,13 +4377,13 @@ out: if (ac) kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) - dquot_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if (!ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, - reserv_blks); + reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3cdb8aa9f6b7..1641f4b57439 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -139,9 +139,9 @@ enum { struct ext4_free_extent { ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; + ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; - ext4_grpblk_t fe_len; + ext4_grpblk_t fe_len; /* In cluster units */ }; /* -- cgit v1.2.3-58-ga151 From 84130193e0e6568dfdfb823f0e1e19aec80aff6e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:50:51 -0400 Subject: ext4: teach ext4_free_blocks() about bigalloc and clusters The ext4_free_blocks() function now has two new flags that indicate whether a partial cluster at the beginning or the end of the block extents should be freed or not. That will be up to the caller (i.e., truncate), who can figure out whether partial clusters at the beginning or the end of a block range can be freed. We also have to update the ext4_mb_free_metadata() and release_blocks_on_commit() machinery to be cluster-based, since it is used by ext4_free_blocks().
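To make the new boundary handling concrete, here is a worked example of the rounding logic added in the diff below (illustrative values, cluster ratio 16):

/*
 * Illustrative only: a request to free blocks 100..139 (count = 40)
 * neither starts nor ends on a cluster boundary (100 & 15 == 4, and
 * after widening on the left, 44 & 15 == 12).
 *
 * Default behaviour - widen to whole clusters:
 *	block = 96, count = 48		(clusters 6..8 are freed)
 *
 * With NOFREE_FIRST_CLUSTER and NOFREE_LAST_CLUSTER - shrink to
 * whole clusters instead:
 *	block = 112, count = 16		(only cluster 7 is freed; the
 *					 partial clusters at either end
 *					 are left to the caller)
 */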
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ fs/ext4/mballoc.c | 86 +++++++++++++++++++++++++++++++++++++++---------------- fs/ext4/mballoc.h | 2 +- 3 files changed, 65 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 803cfa42e1e8..030bfc1cb59d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -539,6 +539,8 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 /* * Flags used by ext4_discard_partial_page_buffers diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8765f2512f13..57ce6960e940 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2602,11 +2602,13 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t block, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; - discard_block = block + ext4_group_first_block_no(sb, block_group); + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); @@ -2633,7 +2635,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) if (test_opt(sb, DISCARD)) ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); + entry->start_cluster, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2646,7 +2648,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_lock_group(sb, entry->group); /* Take it out of per group rb tree */ rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); /* * Clear the trimmed flag for the group so that the next @@ -3300,7 +3302,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_cluster, entry->count); n = rb_next(n); } return; @@ -4401,7 +4403,7 @@ static int can_merge(struct ext4_free_data *entry1, { if ((entry1->t_tid == entry2->t_tid) && (entry1->group == entry2->group) && - ((entry1->start_blk + entry1->count) == entry2->start_blk)) + ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) return 1; return 0; } @@ -4411,7 +4413,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; - ext4_grpblk_t block; + ext4_grpblk_t cluster; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4424,7 +4426,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->node; - block = new_entry->start_blk; + cluster = new_entry->start_cluster; if (!*n) { /* first free block exent. 
We need to @@ -4438,13 +4440,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, node); - if (block < entry->start_blk) + if (cluster < entry->start_cluster) n = &(*n)->rb_left; - else if (block >= (entry->start_blk + entry->count)) + else if (cluster >= (entry->start_cluster + entry->count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, - ext4_group_first_block_no(sb, group) + block, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); return 0; } @@ -4458,7 +4461,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, if (node) { entry = rb_entry(node, struct ext4_free_data, node); if (can_merge(entry, new_entry)) { - new_entry->start_blk = entry->start_blk; + new_entry->start_cluster = entry->start_cluster; new_entry->count += entry->count; rb_erase(node, &(db->bb_free_root)); spin_lock(&sbi->s_md_lock); @@ -4509,6 +4512,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; @@ -4557,6 +4561,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = block & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = count & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; + } + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4565,10 +4601,12 @@ do_more: * Check to see if we are freeing blocks across a group * boundary. 
*/ - if (bit + count > EXT4_CLUSTERS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_CLUSTERS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } + count_clusters = EXT4_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4583,9 +4621,9 @@ do_more: if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); @@ -4610,11 +4648,11 @@ do_more: #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4631,13 +4669,13 @@ do_more: err = -ENOMEM; goto error_return; } - new_entry->start_blk = bit; + new_entry->start_cluster = bit; new_entry->group = block_group; - new_entry->count = count; + new_entry->count = count_clusters; new_entry->t_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap @@ -4645,11 +4683,11 @@ do_more: * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(inode, &e4b, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count; + ret = ext4_free_blks_count(sb, gdp) + count_clusters; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 1641f4b57439..dc99930d4cb5 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -106,7 +106,7 @@ struct ext4_free_data { ext4_group_t group; /* free block extent */ - ext4_grpblk_t start_blk; + ext4_grpblk_t start_cluster; ext4_grpblk_t count; /* transaction which freed this extent */ -- cgit v1.2.3-58-ga151 From 4d33b1ef10995d7ba6191d67456202c697a92a32 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:52:51 -0400 Subject: ext4: teach ext4_ext_map_blocks() about the bigalloc feature If we need to allocate a new block in ext4_ext_map_blocks(), the function needs to see if the cluster has already been allocated. 
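A concrete illustration of the "implied cluster" case (hypothetical numbers, cluster ratio 16): suppose an extent maps logical block 10 to physical block 1034, so both sit at offset 10 within their respective clusters, as the allocator arranges. A lookup for logical block 12 finds no extent, but 12 >> 4 equals 10 >> 4, so the cluster holding block 12 is already allocated, and get_implied_cluster_alloc() in the diff below can return the mapping directly:

	map->m_pblk = 1024 + (12 & 15);	/* 1024 = physical start of the cluster */

so the request is satisfied without calling ext4_mb_new_blocks() at all.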
Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 162 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ba7bd5a176ce..bd42ab29efec 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1270,7 +1270,8 @@ static int ext4_ext_search_left(struct inode *inode, */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1312,9 +1313,7 @@ static int ext4_ext_search_right(struct inode *inode, return -EIO; } } - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { @@ -1327,9 +1326,7 @@ static int ext4_ext_search_right(struct inode *inode, if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } /* go up and search for index to the right */ @@ -1372,9 +1369,12 @@ got_index: return -EIO; } ex = EXT_FIRST_EXTENT(eh); +found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); - put_bh(bh); + *ret_ex = ex; + if (bh) + put_bh(bh); return 0; } @@ -1627,7 +1627,8 @@ static int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -static unsigned int ext4_ext_check_overlap(struct inode *inode, +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { @@ -1641,6 +1642,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, if (!path[depth].p_ext) goto out; b2 = le32_to_cpu(path[depth].p_ext->ee_block); + b2 &= ~(sbi->s_cluster_ratio - 1); /* * get the next allocated block if the extent in the path @@ -1650,6 +1652,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; + b2 &= ~(sbi->s_cluster_ratio - 1); } /* check for wrap through zero on extent logical start block*/ @@ -3293,6 +3296,106 @@ out2: return err ? err : allocated; } +/* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sbi The ext4-specific superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. 
The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). + */ +static int get_implied_cluster_alloc(struct ext4_sb_info *sbi, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. 
+ * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + return 1; + } + return 0; +} + + /* * Block allocation/map/preallocation routine for extents based files * @@ -3315,14 +3418,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int err = 0, depth, ret; - unsigned int allocated = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_lblk_t cluster_offset; struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", @@ -3508,9 +3613,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } + /* * Okay, we need to do block allocation. */ + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_ext_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(sbi, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + goto got_allocated_blocks; + } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; @@ -3518,10 +3637,20 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out2; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) goto out2; + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(sbi, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + goto got_allocated_blocks; + } + /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is @@ -3536,9 +3665,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_len = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ - newex.ee_block = cpu_to_le32(map->m_lblk); newex.ee_len = cpu_to_le16(map->m_len); - err = ext4_ext_check_overlap(inode, &newex, path); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else @@ -3548,7 +3676,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; - ar.len = allocated; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. 
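+ * For example (illustrative numbers, not part of the change itself): with + * s_cluster_ratio = 16, a request at logical block 100 gives offset = + * 100 & 15 = 4, so ar.goal and ar.logical are both pulled back by four + * blocks to the cluster boundary at block 96.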
+ */ + offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else @@ -3561,9 +3700,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); + free_on_err = 1; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; +got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock); + ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); /* Mark uninitialized */ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ @@ -3591,7 +3735,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err) { + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ @@ -4115,7 +4259,6 @@ found_delayed_extent: return EXT_BREAK; return EXT_CONTINUE; } - /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) -- cgit v1.2.3-58-ga151 From 0aa060000e83ca3d09ddc446a7174fb0820d99bc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:54:51 -0400 Subject: ext4: teach ext4_ext_truncate() about the bigalloc feature When we are truncating (as opposed to unlinking) a file, we need to worry about partial truncates of a file, especially in the light of sparse files. The changes here make sure that arbitrary truncates of sparse files work correctly. Yeah, it's messy. Note that these functions will need to be revisited when the punch ioctl is integrated --- in fact this commit will probably have merge conflicts with the punch changes which Allison Henderson and the IBM LTC have been working on. I will need to fix this up when either patch hits mainline. Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd42ab29efec..cd4479c08031 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2202,14 +2202,39 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, - struct ext4_extent *ex, - ext4_lblk_t from, ext4_lblk_t to) + struct ext4_extent *ex, + ext4_fsblk_t *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t pblk; int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of the ext4_truncate() operation. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here.
+ */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2229,12 +2254,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - ext4_fsblk_t start; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); + pblk = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, pblk); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent, save the partial cluster here, since we + * might need to delete it if we determine that the + * truncate operation has removed all of the blocks in + * the cluster. + */ + if (pblk & (sbi->s_cluster_ratio - 1) && + (ee_len == num)) + *partial_cluster = EXT4_B2C(sbi, pblk); + else + *partial_cluster = 0; } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { /* head removal */ @@ -2269,9 +2306,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start, - ext4_lblk_t end) + struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; @@ -2423,7 +2461,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, a, b); + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); if (err) goto out; @@ -2471,7 +2510,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } + } else + *partial_cluster = 0; ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); @@ -2483,6 +2523,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); + /* + * If there is still an entry in the leaf node, check to see if + * it references the partial cluster. This is the only place + * where it could; if it doesn't, we can free the cluster.
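+ * For example (assuming s_cluster_ratio = 4 purely for illustration): + * if *partial_cluster == N and the last extent in this leaf still maps + * a block into cluster N, the cluster is still partly in use and must + * not be freed here.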
+ */ + if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != + *partial_cluster)) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) @@ -2518,6 +2577,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); struct ext4_ext_path *path; + ext4_fsblk_t partial_cluster = 0; handle_t *handle; int i, err; @@ -2553,7 +2613,8 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, EXT_MAX_BLOCKS - 1); + &partial_cluster, start, + EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3495,6 +3556,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { + ext4_fsblk_t partial_cluster = 0; + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); @@ -3578,7 +3641,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_invalidate_cache(inode); err = ext4_ext_rm_leaf(handle, inode, path, - map->m_lblk, map->m_lblk + punched_out); + &partial_cluster, map->m_lblk, + map->m_lblk + punched_out); if (!err && path->p_hdr->eh_entries == 0) { /* -- cgit v1.2.3-58-ga151 From 5704265188ffe4290ed73b3cb685206c3ed8209d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:56:51 -0400 Subject: ext4: convert s_{dirty,free}blocks_counter to s_{dirty,free}clusters_counter Convert the percpu counters s_dirtyblocks_counter and s_freeblocks_counter in struct ext4_sb_info to be s_dirtyclusters_counter and s_freeclusters_counter.
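To make the block/cluster unit conversion these counters now depend on concrete, here is a rough standalone sketch; the B2C()/C2B() macros below are illustrative stand-ins for the kernel's EXT4_B2C()/EXT4_C2B(), not the actual header definitions:

#include <stdio.h>

/* Illustrative stand-ins for EXT4_B2C()/EXT4_C2B(); cluster_bits is
 * log2(blocks per cluster), i.e. the kernel's sbi->s_cluster_bits. */
#define B2C(cluster_bits, blk) ((blk) >> (cluster_bits))
#define C2B(cluster_bits, clu) ((clu) << (cluster_bits))

int main(void)
{
	int cluster_bits = 4;	/* 16 blocks per cluster, e.g. 64k clusters on 4k blocks */
	unsigned long long free_clusters = 1000;

	/* A counter kept in clusters must be scaled back to blocks
	 * before being reported in block units. */
	printf("%llu free clusters = %llu free blocks\n",
	       free_clusters, C2B(cluster_bits, free_clusters));
	printf("block 100 lives in cluster %llu\n", B2C(cluster_bits, 100ULL));
	return 0;
}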
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 10 +++++----- fs/ext4/ext4.h | 5 +++-- fs/ext4/ialloc.c | 3 ++- fs/ext4/inode.c | 18 ++++++++++-------- fs/ext4/mballoc.c | 12 +++++++----- fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 29 +++++++++++++++-------------- 7 files changed, 44 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 89abf1f7b253..9080a857cda9 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -414,16 +414,16 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks, unsigned int flags) { s64 free_blocks, dirty_blocks, root_blocks; - struct percpu_counter *fbc = &sbi->s_freeblocks_counter; - struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dbc = &sbi->s_dirtyclusters_counter; - free_blocks = percpu_counter_read_positive(fbc); + free_blocks = percpu_counter_read_positive(fcc); dirty_blocks = percpu_counter_read_positive(dbc); root_blocks = ext4_r_blocks_count(sbi->s_es); if (free_blocks - (nblocks + root_blocks + dirty_blocks) < EXT4_FREEBLOCKS_WATERMARK) { - free_blocks = percpu_counter_sum_positive(fbc); + free_blocks = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); dirty_blocks = percpu_counter_sum_positive(dbc); } /* Check whether we have space after @@ -449,7 +449,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks, unsigned int flags) { if (ext4_has_free_blocks(sbi, nblocks, flags)) { - percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + percpu_counter_add(&sbi->s_dirtyclusters_counter, nblocks); return 0; } else return -ENOSPC; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 030bfc1cb59d..c7588366471c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -855,6 +855,7 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; @@ -1144,10 +1145,10 @@ struct ext4_sb_info { u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; + struct percpu_counter s_dirtyclusters_counter; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b7a8130d0af4..58115bad163f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -490,7 +490,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); avefreeb = freeb; do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 904a9a623dab..40f51aae42fe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -281,7 +281,7 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= 
ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used + ei->i_allocated_meta_blocks); ei->i_allocated_meta_blocks = 0; @@ -291,7 +291,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * only when we have written all of the delayed * allocation blocks. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; @@ -1119,14 +1119,14 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * only when we have written all of the delayed * allocation blocks. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; } /* update fs dirty data blocks counter */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -1349,9 +1349,10 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_count_free_blocks(inode->i_sb)); printk(KERN_CRIT "Free/Dirty block details\n"); printk(KERN_CRIT "free_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); printk(KERN_CRIT "Block reservation details\n"); printk(KERN_CRIT "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); @@ -2226,8 +2227,9 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + free_blocks = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 57ce6960e940..4a38b65bd564 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2834,13 +2834,14 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. 
Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_clstrs); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -4384,7 +4385,7 @@ out: if (!ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } @@ -4691,7 +4692,7 @@ do_more: ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); @@ -4833,7 +4834,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 707d3f16f7ce..a324a537f2dc 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) input->reserved_blocks); /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, input->free_blocks_count)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 25a4bfe3f39f..f81e7e791655 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -837,10 +837,10 @@ static void ext4_put_super(struct super_block *sb) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -2473,7 +2473,7 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (s64) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, @@ -3575,7 +3575,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; - err = percpu_counter_init(&sbi->s_freeblocks_counter, + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_blocks(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, @@ -3586,7 +3586,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 
ext4_count_dirs(sb)); } if (!err) { - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); @@ -3701,13 +3701,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * The journal may have updated the bg summary counts, so we * need to update the global counters. */ - percpu_counter_set(&sbi->s_freeblocks_counter, + percpu_counter_set(&sbi->s_freeclusters_counter, ext4_count_free_blocks(sb)); percpu_counter_set(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); percpu_counter_set(&sbi->s_dirs_counter, ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); + percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: /* @@ -3847,10 +3847,10 @@ failed_mount3: del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4173,8 +4173,9 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); @@ -4629,10 +4630,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ - buf->f_bfree = max_t(s64, bfree, 0); + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; -- cgit v1.2.3-58-ga151 From 24aaa8ef4e2b5764ada1fc69787e2fbd4f6276e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:58:51 -0400 Subject: ext4: convert the free_blocks field in s_flex_groups to be free_clusters Convert the free_blocks to be free_clusters to make the final revised bigalloc changes easier to read/understand. 
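As a rough sketch of the accounting consequence (illustrative numbers only; c2b() below stands in for EXT4_C2B()), the per-flex-group counter now holds clusters and has to be scaled to blocks before a free-space percentage can be taken, as find_group_flex() does in this patch:

#include <stdio.h>

/* Stand-in for EXT4_C2B(): clusters -> blocks. */
static unsigned int c2b(unsigned int cluster_bits, unsigned int clusters)
{
	return clusters << cluster_bits;
}

int main(void)
{
	unsigned int cluster_bits = 4;			/* 16 blocks per cluster */
	unsigned int blocks_per_flex = 32768 * 16;	/* blocks per group * flex size */
	unsigned int flexbg_free_clusters = 20000;

	/* The counter holds clusters, so scale to blocks first. */
	unsigned int ratio = c2b(cluster_bits, flexbg_free_clusters) * 100
			     / blocks_per_flex;
	printf("flex group free-block ratio: %u%%\n", ratio);
	return 0;
}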
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 40 +++++++++++++++++++++------------------- fs/ext4/mballoc.c | 9 +++++---- fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 2 +- 5 files changed, 30 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c7588366471c..d2584224c89a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -300,7 +300,7 @@ struct ext4_group_desc struct flex_groups { atomic_t free_inodes; - atomic_t free_blocks; + atomic_t free_clusters; atomic_t used_dirs; }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 58115bad163f..0be5862313f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -346,7 +346,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_blocks; + int flexbg_free_clusters; int flex_freeb_ratio; ext4_group_t n_fbg_groups; ext4_group_t i; @@ -355,8 +355,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + flexbg_free_clusters = atomic_read(&flex_group[best_flex].free_clusters); + flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / + blocks_per_flex; if (atomic_read(&flex_group[best_flex].free_inodes) && flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -370,8 +371,9 @@ find_close_to_parent: if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + flexbg_free_clusters = atomic_read(&flex_group[i].free_clusters); + flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / + blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && (atomic_read(&flex_group[i].free_inodes))) { @@ -380,14 +382,14 @@ find_close_to_parent: } if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && + ((atomic_read(&flex_group[i].free_clusters) > + atomic_read(&flex_group[best_flex].free_clusters)) && atomic_read(&flex_group[i].free_inodes))) best_flex = i; } if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) + !atomic_read(&flex_group[best_flex].free_clusters)) return -1; found_flexbg: @@ -407,7 +409,7 @@ out: struct orlov_stats { __u32 free_inodes; - __u32 free_blocks; + __u32 free_clusters; __u32 used_dirs; }; @@ -424,7 +426,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->free_clusters = atomic_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -432,11 +434,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_blks_count(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; - stats->free_blocks = 0; + 
stats->free_clusters = 0; stats->used_dirs = 0; } } @@ -471,10 +473,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; - ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; - ext4_grpblk_t min_blocks; + ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; @@ -492,8 +494,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, avefreei = freei / ngroups; freeb = EXT4_C2B(sbi, percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - avefreeb = freeb; - do_div(avefreeb, ngroups); + avefreec = freeb; + do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && @@ -519,7 +521,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; @@ -557,7 +559,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an @@ -576,7 +578,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4a38b65bd564..f8e37cf2c2dd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2847,7 +2847,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4696,7 +4696,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4839,8 +4840,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a324a537f2dc..95a09ddca3b9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - atomic_add(input->free_blocks_count, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, input->free_blocks_count), + &sbi->s_flex_groups[flex_group].free_clusters); 
atomic_add(EXT4_INODES_PER_GROUP(sb), &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f81e7e791655..d7e0e045b11b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2035,7 +2035,7 @@ static int ext4_fill_flex_info(struct super_block *sb) atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); atomic_add(ext4_free_blks_count(sb, gdp), - &sbi->s_flex_groups[flex_group].free_blocks); + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } -- cgit v1.2.3-58-ga151 From f975d6bcc7a698a10cc755115e27d3612dcfe322 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:00:51 -0400 Subject: ext4: teach ext4_statfs() to deal with clusters if bigalloc is enabled Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d7e0e045b11b..6810957e0ac7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4580,16 +4580,34 @@ restore_opts: return err; } +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block groups can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the overhead for + * older file systems --- and if we come across a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct, especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. + */ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_group_desc *gdp; u64 fsid; s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; + } else if (es->s_overhead_clusters) { + sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; @@ -4604,24 +4622,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * All of the blocks before first_data_block are * overhead */ - overhead = le32_to_cpu(es->s_first_data_block); + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. + * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { - overhead += ext4_bg_has_super(sb, i) + - ext4_bg_num_gdb(sb, i); + gdp = ext4_get_group_desc(sb, i, NULL); + overhead += ext4_num_overhead_clusters(sb, i, gdp); cond_resched(); } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table.
- */ - overhead += ngroups * (2 + sbi->s_itb_per_group); sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = ext4_blocks_count(es); } buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; + buf->f_blocks = (ext4_blocks_count(es) - + EXT4_C2B(sbi, sbi->s_overhead_last)); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ -- cgit v1.2.3-58-ga151 From 27baebb849d46d901e756e6502b0a65a62e43771 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:02:51 -0400 Subject: ext4: tune mballoc's default group prealloc size for bigalloc file systems The default group preallocation size had been previously set to 512 blocks/clusters, regardless of the block/cluster size. This is probably too big for large cluster sizes. So adjust the default so that it is 2 megabytes or 32 clusters, whichever is larger. Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f8e37cf2c2dd..63dd56703342 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -126,7 +126,8 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O @@ -2473,7 +2474,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e., if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group prealloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default.
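+ * Worked through with 4k blocks (illustrative): 512 >> 4 = 32 clusters + * at a 64k cluster size (still 2 megs), while 512 >> 6 = 8 and + * 512 >> 8 = 2 are both raised to the 32-cluster floor for 256k and + * 1 meg clusters respectively.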
+ */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than -- cgit v1.2.3-58-ga151 From 7b415bf60f6afb0499fd3dc0ee33444f54e28567 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 9 Sep 2011 19:04:51 -0400 Subject: ext4: Fix bigalloc quota accounting and i_blocks value With bigalloc changes, the i_blocks value was not correctly set (it was still set to number of blocks being used, but in case of bigalloc, we want i_blocks to represent the number of clusters being used). Since the quota subsystem sets the i_blocks value, this patch fixes the quota accounting and makes sure that the i_blocks value is set correctly. Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 5 +- fs/ext4/ext4.h | 16 ++- fs/ext4/ext4_extents.h | 2 + fs/ext4/extents.c | 306 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/inode.c | 54 ++++++--- fs/ext4/mballoc.c | 5 +- fs/ext4/super.c | 3 +- 7 files changed, 366 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9080a857cda9..bf42b3219e3c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -485,7 +485,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: pointer to total number of blocks needed + * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account @@ -517,7 +517,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_alloc_block_nofail(inode, ar.len); + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d2584224c89a..a6307f7c9807 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -144,9 +144,17 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) #define EXT4_MAP_UNINIT (1 << BH_Uninit) +/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of + * ext4_map_blocks wants to know whether or not the underlying cluster has + * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that + * the requested mapping was from previously mapped (or delayed allocated) + * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster + * should never appear on buffer_head's state flags. 
+ */ +#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT) + EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -1884,6 +1892,7 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2284,6 +2293,11 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ = BH_JBDPrivateStart, + BH_AllocFromCluster, /* allocated blocks were part of already + * allocated cluster. Note that this flag will + * never, ever appear in a buffer_head's state + * flag. See EXT4_MAP_FROM_CLUSTER to see where + * this is used. */ }; BUFFER_FNS(Uninit, uninit) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 095c36f3b612..a52db3a69a30 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cd4479c08031..c4e005864534 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2686,6 +2686,21 @@ again: } } + /* If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. */ + if (partial_cluster && path->p_hdr->eh_entries == 0) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(EXT4_SB(sb), partial_cluster), + EXT4_SB(sb)->s_cluster_ratio, flags); + partial_cluster = 0; + } + /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* @@ -3233,6 +3248,195 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, return ext4_mark_inode_dirty(handle, inode); } +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. 
+ */ +static int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end, + int search_hint_reverse) +{ + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; + + /* reverse search won't work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page || !PageDirty(page)) + goto nextpage; + + if (PageWriteback(page)) { + /* + * This might be a race with allocation and writeout. In + * this case we just assume that the rest of the range + * will eventually be written and there won't be any + * delalloc blocks left. + * TODO: the above assumption is troublesome, but might + * work better in practice. Another option could be to + * note somewhere that the cluster is getting written out + * and detect that here. + */ + page_cache_release(page); + return 0; + } + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + if (buffer_delay(bh)) { + page_cache_release(page); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. + */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and for which quota was reserved. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicates delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster.
+ * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete cluster in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * ext4_da_update_reserve_space() will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + /* Check towards left side */ + c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + if (c_offset) { + lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + /* Now check towards right. */ + c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + return allocated_clusters; +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3338,8 +3542,15 @@ out: * But fallocate would have already updated quota and block * count for this offset.
So cancel these reservations */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 0); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } map_out: map->m_flags |= EXT4_MAP_MAPPED; @@ -3484,6 +3695,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0, reserved_clusters = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; @@ -3499,6 +3711,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3509,6 +3725,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* we should allocate requested block */ } else { /* block is already allocated */ + if (sbi->s_cluster_ratio > 1) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; newblock = map->m_lblk - le32_to_cpu(newex.ee_block) + ext4_ext_pblock(&newex); @@ -3665,6 +3883,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -3681,6 +3903,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); @@ -3692,6 +3915,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(sbi, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; goto got_allocated_blocks; } @@ -3712,6 +3936,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(sbi, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; goto got_allocated_blocks; } @@ -3765,6 +3990,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); free_on_err = 1; + allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; if (ar.len > allocated) ar.len = allocated; @@ -3822,8 +4048,80 @@ got_allocated_blocks: * Update reserved blocks/metadata blocks after successful * block allocation which had been deferred till now. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 1); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + /* + * Check how many clusters we had reserved for this allocated range.
+ */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (reserved_clusters) { + /* + * We have clusters reserved for this range. + * But since we are not doing actual allocation + * and are simply using blocks from previously + * allocated cluster, we should release the + * reservation and not claim quota. + */ + ext4_da_update_reserve_space(inode, + reserved_clusters, 0); + } + } else { + BUG_ON(allocated_clusters < reserved_clusters); + /* We will claim quota for all newly allocated blocks.*/ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + if (reserved_clusters < allocated_clusters) { + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed a few clusters outside of + * the range of this allocation. We should give + * them back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks). Thus, the clusters + * are [0-3],[4-7],[8-11]... + * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11], as + * that range has delayed allocated blocks). + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. + * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]). But our reserved cluster count had + * already gone to 0. + * + * Thus, at step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally get written, we + * could claim them. + */ + while (reservation) { + ext4_da_reserve_space(inode, + map->m_lblk); + reservation--; + } + } + } + } /* * Cache the extent and update transaction to commit on fdatasync only diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 40f51aae42fe..d1c17e47c1c6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -300,14 +300,14 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem for data blocks */ if (quota_claim) - dquot_claim_block(inode, used); + dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks.
*/ - dquot_release_reservation_block(inode, used); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* @@ -1037,14 +1037,14 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single block located at lblock + * Reserve a single cluster located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long md_needed; + unsigned int md_needed; int ret; /* @@ -1054,7 +1054,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) */ repeat: spin_lock(&ei->i_block_reservation_lock); - md_needed = ext4_calc_metadata_amount(inode, lblock); + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); @@ -1063,7 +1064,7 @@ repeat: * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, 1); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; /* @@ -1071,7 +1072,7 @@ repeat: * we cannot afford to run out of free blocks. */ if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { - dquot_release_reservation_block(inode, 1); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1118,6 +1119,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * We can release all of the reserved metadata blocks * only when we have written all of the delayed * allocation blocks. + * Note that in case of bigalloc, i_reserved_meta_blocks, + * i_reserved_data_blocks, etc. refer to number of clusters. */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); @@ -1130,7 +1133,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } static void ext4_da_page_release_reservation(struct page *page, @@ -1139,6 +1142,9 @@ static void ext4_da_page_release_reservation(struct page *page, int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int num_clusters; head = page_buffers(page); bh = head; @@ -1151,7 +1157,20 @@ static void ext4_da_page_release_reservation(struct page *page, } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, to_release); + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. 
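+ * For example (assuming s_cluster_ratio = 4 for illustration): releasing + * blocks 8-11 drops one cluster's worth of reservation, but only if + * ext4_find_delalloc_cluster() confirms that no other block in that + * cluster is still marked for delayed allocation.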
*/ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + ext4_fsblk_t lblk; + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk, 1)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } } /* @@ -1352,7 +1371,8 @@ static void ext4_print_free_blocks(struct inode *inode) (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); printk(KERN_CRIT "Block reservation details\n"); printk(KERN_CRIT "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); @@ -1626,10 +1646,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * XXX: __block_write_begin() unmaps passed block, is it OK? */ - ret = ext4_da_reserve_space(inode, iblock); - if (ret) - /* not enough space to reserve */ - return ret; + /* If the block was allocated from previously allocated cluster, + * then we dont need to reserve it again. */ + if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) { + ret = ext4_da_reserve_space(inode, iblock); + if (ret) + /* not enough space to reserve */ + return ret; + } map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 63dd56703342..5e1215d38331 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4718,6 +4718,9 @@ do_more: freed += count; + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4736,8 +4739,6 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); return; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6810957e0ac7..66b8cfa15636 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2473,7 +2473,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, -- cgit v1.2.3-58-ga151 From 6f16b60690ba04cf476480a6f19b204e4b95b4a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:06:51 -0400 Subject: ext4: enable mounting bigalloc as read/write Now that we have implemented all of the changes needed for bigalloc, we can finally enable it! 
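As a reading aid for the cluster arithmetic used throughout this series, here is a minimal stand-alone sketch of what the EXT4_B2C()/EXT4_C2B()/EXT4_NUM_B2C() conversions compute. The struct and shortened macro names below are simplified assumptions for illustration, not the kernel definitions; the cluster ratio of 4 matches the worked example in the comment earlier in the series.

#include <stdio.h>

/* Simplified stand-in for ext4_sb_info: just the cluster geometry. */
struct sbi {
	unsigned int cluster_bits;	/* log2(blocks per cluster) */
	unsigned int cluster_mask;	/* blocks per cluster - 1 */
};

/* Approximations of the kernel macros (names shortened here): */
#define B2C(s, blk)	((blk) >> (s)->cluster_bits)	/* block number -> cluster number */
#define C2B(s, cl)	((cl) << (s)->cluster_bits)	/* cluster count -> block count */
#define NUM_B2C(s, b)	(((b) + (s)->cluster_mask) >> (s)->cluster_bits) /* blocks -> clusters, rounded up */

int main(void)
{
	struct sbi s = { .cluster_bits = 2, .cluster_mask = 3 };	/* s_cluster_ratio == 4 */

	printf("block 10 lies in cluster %u\n", B2C(&s, 10u));		/* 2, i.e. range [8-11] */
	printf("6 blocks need %u cluster(s)\n", NUM_B2C(&s, 6u));	/* 2 */
	printf("3 clusters cover %u blocks\n", C2B(&s, 3u));		/* 12 */
	return 0;
}

This is why the quota calls in the diffs above wrap their counts in EXT4_C2B(): the dquot layer still accounts in blocks, while the reservation counters now track clusters.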
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a6307f7c9807..a5a7e369f719 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1439,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC) /* * Default values for user and/or group using reserved blocks -- cgit v1.2.3-58-ga151 From 021b65bb1e4e4b625c80bbb82651e5e155721ef3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:08:51 -0400 Subject: ext4: Rename ext4_free_blks_{count,set}() to refer to clusters The field bg_free_blocks_count_{lo,hi} in the block group descriptor has been repurposed to hold the number of free clusters for bigalloc file systems. So rename the functions to make it easier to read and audit the block allocation and block freeing code. Note: at this point in bigalloc development we don't support online resize, so this also makes it really obvious all of the places we need to fix up to add support for online resize. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 8 ++++---- fs/ext4/ext4.h | 9 +++++---- fs/ext4/ialloc.c | 14 +++++++------- fs/ext4/mballoc.c | 22 +++++++++++----------- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 10 +++++----- 6 files changed, 33 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bf42b3219e3c..84dc3d62b40c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -168,7 +168,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag.
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); + ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); @@ -550,7 +550,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) @@ -558,7 +558,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) x = ext4_count_free(bitmap_bh, sb->s_blocksize); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", - i, ext4_free_blks_count(sb, gdp), x); + i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); @@ -572,7 +572,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a5a7e369f719..db279c362017 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1978,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); -extern __u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, @@ -1992,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0be5862313f0..71a9c8f3dece 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, * allocation, essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); + ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); @@ -322,8 +322,8 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, if (ext4_free_inodes_count(sb, desc) < avefreei) continue; if (!best_desc || - (ext4_free_blks_count(sb, desc) > - ext4_free_blks_count(sb, best_desc))) { + (ext4_free_group_clusters(sb, desc) > + ext4_free_group_clusters(sb, best_desc))) { *best_group = group; best_desc = desc; ret = 0; @@ -434,7 +434,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_clusters = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; @@ -662,7 +662,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; /* @@ -686,7 +686,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; } @@ -960,7 +960,7 @@ got: ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, + ext4_free_group_clusters_set(sb, gdp, ext4_free_blocks_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5e1215d38331..76db2f12107f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2256,7 +2256,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, ext4_free_blocks_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - ext4_free_blks_count(sb, desc); + ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); @@ -2799,7 +2799,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - ext4_free_blks_count(sb, gdp)); + ext4_free_group_clusters(sb, gdp)); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) @@ -2839,12 +2839,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, - ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_blks_set(sb, gdp, len); + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); @@ -4702,8 +4702,8 @@ do_more: mb_free_blocks(inode, 
&e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count_clusters; - ext4_free_blks_set(sb, gdp, ret); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); @@ -4846,8 +4846,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 95a09ddca3b9..996780ab4f4e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 66b8cfa15636..30cf0e1bd89c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -163,8 +163,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } -__u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg) +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? @@ -219,8 +219,8 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } -void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) @@ -2034,7 +2034,7 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_blks_count(sb, gdp), + atomic_add(ext4_free_group_clusters(sb, gdp), &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); -- cgit v1.2.3-58-ga151 From 5dee54372c1ea15ab482b959634cda8c01b042bd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:10:51 -0400 Subject: ext4: rename ext4_count_free_blocks() to ext4_count_free_clusters() This function really counts the free clusters reported in the block group descriptors, so rename it to reduce confusion. 
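To make the rename concrete, here is a minimal stand-alone sketch of the counting loop this function performs, with simplified types that are assumptions for illustration; the real code walks ext4_get_group_desc() and reads ext4_free_group_clusters() from each descriptor, as the diff below shows.

#include <stddef.h>

typedef unsigned long long ext4_fsblk_t;	/* stand-in for the kernel typedef */

struct group_desc {
	unsigned int free_clusters;	/* what ext4_free_group_clusters() returns */
};

/* Sum the per-group free-cluster counts. The result is in clusters, not
 * blocks, so callers that want blocks must scale it with EXT4_C2B(). */
static ext4_fsblk_t count_free_clusters(const struct group_desc *gdp, size_t ngroups)
{
	ext4_fsblk_t desc_count = 0;
	size_t i;

	for (i = 0; i < ngroups; i++)
		desc_count += gdp[i].free_clusters;
	return desc_count;
}

The EXT4FS_DEBUG branch in the diff below additionally cross-checks this descriptor sum against a count taken directly from the on-disk block bitmaps.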
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 11 ++++++----- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 6 +++--- fs/ext4/super.c | 7 ++++--- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 84dc3d62b40c..4f303809e076 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -524,12 +524,12 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, } /** - * ext4_count_free_blocks() -- count filesystem free blocks + * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * - * Adds up the number of free blocks from each block group. + * Adds up the number of free clusters from each block group. */ -ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; @@ -562,8 +562,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count += x; } brelse(bitmap_bh); - printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_B2C(sbi, ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index db279c362017..f6963cd7f709 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1775,7 +1775,7 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks, unsigned int flags); -extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1c17e47c1c6..f9198d7875cc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1365,7 +1365,8 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); printk(KERN_CRIT "Total free blocks count %lld\n", - ext4_count_free_blocks(inode->i_sb)); + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(inode->i_sb))); printk(KERN_CRIT "Free/Dirty block details\n"); printk(KERN_CRIT "free_blocks=%lld\n", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), @@ -1451,8 +1452,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) if (err == -EAGAIN) goto submit_io; - if (err == -ENOSPC && - ext4_count_free_blocks(sb)) { + if (err == -ENOSPC && ext4_count_free_clusters(sb)) { mpd->retval = err; goto submit_io; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 30cf0e1bd89c..07f3de341452 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2153,7 +2153,8 @@ static int ext4_check_descriptors(struct super_block *sb, if (NULL != first_not_zeroed) *first_not_zeroed = grp; - ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, ext4_count_free_clusters(sb))); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -3577,7 +3578,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; err = percpu_counter_init(&sbi->s_freeclusters_counter, - ext4_count_free_blocks(sb)); + 
ext4_count_free_clusters(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); @@ -3703,7 +3704,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * need to update the global counters. */ percpu_counter_set(&sbi->s_freeclusters_counter, - ext4_count_free_blocks(sb)); + ext4_count_free_clusters(sb)); percpu_counter_set(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); percpu_counter_set(&sbi->s_dirs_counter, -- cgit v1.2.3-58-ga151 From cff1dfd767d1ee3c773fd8b57fe310957e5f8abb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:12:51 -0400 Subject: ext4: rename ext4_free_blocks_after_init() to ext4_free_clusters_after_init() This function really returns the number of clusters after an uninitialized block bitmap has been initialized. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 6 +++--- fs/ext4/ext4.h | 6 +++--- fs/ext4/ialloc.c | 2 +- fs/ext4/mballoc.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4f303809e076..3d07c35c7089 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -213,9 +213,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ -unsigned ext4_free_blocks_after_init(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f6963cd7f709..1473c06e4c94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1787,9 +1787,9 @@ extern void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t group, struct ext4_group_desc *desc); -extern unsigned ext4_free_blocks_after_init(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 71a9c8f3dece..d50a7d5e4726 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -961,7 +961,7 @@ got: if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, - ext4_free_blocks_after_init(sb, group, gdp)); + ext4_free_clusters_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 76db2f12107f..90a3ed7c565c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2253,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, */ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { meta_group_info[i]->bb_free = - ext4_free_blocks_after_init(sb, group, desc); + ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); @@ -2840,7 +2840,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, - ext4_free_blocks_after_init(sb, + ext4_free_clusters_after_init(sb, ac->ac_b_ex.fe_group, gdp)); } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; -- cgit v1.2.3-58-ga151 From e7d5f3156e6827970f75ab27ad7eb0155826eb0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:14:51 -0400 Subject: ext4: rename ext4_claim_free_blocks() to ext4_claim_free_clusters() This function really claims a number of free clusters, not blocks, so rename it so it's clearer what's going on. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 8 ++++---- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 3d07c35c7089..af0c5357e35a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -445,11 +445,11 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, return 0; } -int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nblocks, flags)) { - percpu_counter_add(&sbi->s_dirtyclusters_counter, nblocks); + if (ext4_has_free_blocks(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1473c06e4c94..f97638634e34 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1773,8 +1773,8 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, unsigned int flags, unsigned long *count, int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9198d7875cc..557c34ffa05b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1071,7 +1071,7 @@ repeat: * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 90a3ed7c565c..1c83161090f8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4297,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * and verify allocation doesn't exceed the quota limits. */ while (ar->len && - ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ yield(); -- cgit v1.2.3-58-ga151 From df55c99dc8ee4c3c886a5edc8a4aa6b131c95afc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:16:51 -0400 Subject: ext4: rename ext4_has_free_blocks() to ext4_has_free_clusters() Rename the function so it is more clear what is going on. Also rename the various variables so it's clearer what's happening. 
Also fix a missing blocks to cluster conversion when reading the number of reserved blocks for root. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 43 ++++++++++++++++++++++--------------------- fs/ext4/ext4.h | 8 ++++---- fs/ext4/inode.c | 2 +- 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index af0c5357e35a..f6dba4505f1c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -403,42 +403,43 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_has_free_blocks() + * ext4_has_free_clusters() * @sbi: in-core super block structure. - * @nblocks: number of needed blocks + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() * - * Check if filesystem has nblocks free & available for allocation. + * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - s64 free_blocks, dirty_blocks, root_blocks; + s64 free_clusters, dirty_clusters, root_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; - struct percpu_counter *dbc = &sbi->s_dirtyclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; - free_blocks = percpu_counter_read_positive(fcc); - dirty_blocks = percpu_counter_read_positive(dbc); - root_blocks = ext4_r_blocks_count(sbi->s_es); + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); - if (free_blocks - (nblocks + root_blocks + dirty_blocks) < - EXT4_FREEBLOCKS_WATERMARK) { - free_blocks = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); - dirty_blocks = percpu_counter_sum_positive(dbc); + if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + dirty_clusters = percpu_counter_sum_positive(dcc); } - /* Check whether we have space after - * accounting for current dirty blocks & root reserved blocks. + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. */ - if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) return 1; - /* Hm, nope. Are (enough) root reserved blocks available? */ + /* Hm, nope. Are (enough) root reserved clusters available? 
*/ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - if (free_blocks >= (nblocks + dirty_blocks)) + if (free_clusters >= (nclusters + dirty_clusters)) return 1; } @@ -448,7 +449,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nclusters, flags)) { + if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else @@ -469,7 +470,7 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f97638634e34..21ea65d8bd46 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2103,13 +2103,13 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate percpu_counter_batch blocks in their local - * counters. So we need to make sure we have free blocks more +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else -#define EXT4_FREEBLOCKS_WATERMARK 0 +#define EXT4_FREECLUSTERS_WATERMARK 0 #endif static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 557c34ffa05b..88dc63a01756 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2255,7 +2255,7 @@ static int ext4_nonda_switch(struct super_block *sb) percpu_counter_read_positive(&sbi->s_freeclusters_counter)); dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark -- cgit v1.2.3-58-ga151 From d8990240d8c911064447f8aa5a440f9345a6d692 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 9 Sep 2011 19:18:51 -0400 Subject: ext4: add some tracepoints in ext4/extents.c This patch adds some tracepoints in ext4/extents.c and updates a tracepoint in ext4/inode.c. Tested: Built and ran the kernel and verified that these tracepoints work. Also ran xfstests. 
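All of the events added below follow the stock TRACE_EVENT() pattern. As an orienting skeleton (the event name and fields here are illustrative assumptions, not one of the events in this patch, and the include/trace header boilerplate that must surround such a definition is omitted):

TRACE_EVENT(ext4_example_event,
	/* prototype and arguments of the generated trace_ext4_example_event() */
	TP_PROTO(struct inode *inode, unsigned int len),
	TP_ARGS(inode, len),

	/* layout of the record stored in the trace ring buffer */
	TP_STRUCT__entry(
		__field(	dev_t,		dev	)
		__field(	ino_t,		ino	)
		__field(	unsigned int,	len	)
	),

	/* runs at the call site when the event is enabled */
	TP_fast_assign(
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
		__entry->len = len;
	),

	/* how the record is formatted when read back from tracefs */
	TP_printk("dev %d,%d ino %lu len %u",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long) __entry->ino, __entry->len)
);

A call site then just invokes trace_ext4_example_event(inode, len); while the event is disabled, the call compiles down to essentially a no-op, which is why sprinkling these through the allocator paths is cheap.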
Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 + fs/ext4/extents.c | 44 ++++- fs/ext4/inode.c | 3 +- fs/ext4/migrate.c | 1 - fs/ext4/move_extent.c | 1 - include/trace/events/ext4.h | 398 +++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 433 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 21ea65d8bd46..751277a4890c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2339,4 +2339,6 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#include "ext4_extents.h" + #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c4e005864534..9b119308daea 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -42,7 +42,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" #include @@ -1969,6 +1968,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_ext_put_in_cache(inode, block, len, start); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_block = block; cex->ec_len = len; @@ -2070,6 +2070,7 @@ errout: sbi->extent_cache_misses++; else sbi->extent_cache_hits++; + trace_ext4_ext_in_cache(inode, block, ret); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } @@ -2137,6 +2138,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; @@ -2222,6 +2225,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* * If we have a partial cluster, and it's different from the * cluster of the last block, we need to explicitly free the @@ -2336,6 +2340,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2591,6 +2597,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) again: ext4_ext_invalidate_cache(inode); + trace_ext4_ext_remove_space(inode, start, depth); + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. @@ -2686,6 +2694,9 @@ again: } } + trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, + path->p_hdr->eh_entries); + /* If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial * cluster as well. */ @@ -3300,6 +3311,10 @@ static int ext4_find_delalloc_range(struct inode *inode, * detect that here. 
*/ page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 0, i); return 0; } @@ -3327,6 +3342,10 @@ static int ext4_find_delalloc_range(struct inode *inode, if (buffer_delay(bh)) { page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); return 1; } if (search_hint_reverse) @@ -3349,6 +3368,8 @@ nextpage: i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); } + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); return 0; } @@ -3414,6 +3435,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, /* max possible clusters for this allocation */ allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + /* Check towards left side */ c_offset = lblk_start & (sbi->s_cluster_ratio - 1); if (c_offset) { @@ -3453,6 +3476,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); + /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, @@ -3572,7 +3598,7 @@ out2: * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. - * @sbi The ext4-specific superblock structure + * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation @@ -3609,11 +3635,12 @@ out2: * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). */ -static int get_implied_cluster_alloc(struct ext4_sb_info *sbi, +static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start, rr_cluster_end; @@ -3662,8 +3689,12 @@ static int get_implied_cluster_alloc(struct ext4_sb_info *sbi, ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } @@ -3772,6 +3803,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { ext4_fsblk_t partial_cluster = 0; @@ -3912,7 +3946,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * by ext4_ext_find_extent() implies a cluster we can use. 
*/ if (cluster_offset && ex && - get_implied_cluster_alloc(sbi, map, ex, path)) { + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; map->m_flags |= EXT4_MAP_FROM_CLUSTER; @@ -3933,7 +3967,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && ex2 && - get_implied_cluster_alloc(sbi, map, ex2, path)) { + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; map->m_flags |= EXT4_MAP_FROM_CLUSTER; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 88dc63a01756..2dcd4fed96ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -42,7 +42,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" #include "truncate.h" #include @@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); - trace_ext4_da_update_reserve_space(inode, used); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b57b98fb44d1..6f07a06f2437 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -15,7 +15,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f57455a1b1b2..c5826c623e7a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -17,7 +17,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" #include "ext4.h" /** diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b50a54736242..c9a341e385a3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -12,6 +12,8 @@ struct ext4_allocation_request; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; +struct ext4_map_blocks; +struct ext4_extent; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -1032,9 +1034,9 @@ TRACE_EVENT(ext4_forget, ); TRACE_EVENT(ext4_da_update_reserve_space, - TP_PROTO(struct inode *inode, int used_blocks), + TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), - TP_ARGS(inode, used_blocks), + TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1045,6 +1047,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, __field( int, reserved_data_blocks ) __field( int, reserved_meta_blocks ) __field( int, allocated_meta_blocks ) + __field( int, quota_claim ) ), TP_fast_assign( @@ -1053,19 +1056,24 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; - __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; - __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; - __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + __entry->reserved_data_blocks = + EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = + EXT4_I(inode)->i_reserved_meta_blocks; + __entry->allocated_meta_blocks = + EXT4_I(inode)->i_allocated_meta_blocks; + __entry->quota_claim = quota_claim; ), TP_printk("dev %d,%d ino %lu mode 
0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d reserved_meta_blocks %d " - "allocated_meta_blocks %d", + "allocated_meta_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, - __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) + __entry->reserved_meta_blocks, __entry->allocated_meta_blocks, + __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, @@ -1589,6 +1597,382 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_ARGS(sb, group, start, len) ); +TRACE_EVENT(ext4_ext_handle_uninitialized_extents, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + unsigned int allocated, ext4_fsblk_t newblock), + + TP_ARGS(inode, map, allocated, newblock), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned int, len ) + __field( int, flags ) + __field( unsigned int, allocated ) + __field( ext4_fsblk_t, newblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = map->m_lblk; + __entry->pblk = map->m_pblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; + __entry->allocated = allocated; + __entry->newblk = newblock; + ), + + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" + "allocated %d newblock %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, + __entry->len, __entry->flags, + (unsigned int) __entry->allocated, + (unsigned long long) __entry->newblk) +); + +TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, + TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), + + TP_ARGS(sb, map, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->lblk = map->m_lblk; + __entry->pblk = map->m_pblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lblk, (unsigned long long) __entry->pblk, + __entry->len, __entry->flags, __entry->ret) +); + +TRACE_EVENT(ext4_ext_put_in_cache, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, + ext4_fsblk_t start), + + TP_ARGS(inode, lblk, len, start), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( unsigned int, len ) + __field( ext4_fsblk_t, start ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + __entry->start = start; + ), + + TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + __entry->len, + (unsigned long long) __entry->start) +); + +TRACE_EVENT(ext4_ext_in_cache, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), + + TP_ARGS(inode, lblk, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = 
inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu lblk %u ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + __entry->ret) + +); + +TRACE_EVENT(ext4_find_delalloc_range, + TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, + int reverse, int found, ext4_lblk_t found_blk), + + TP_ARGS(inode, from, to, reverse, found, found_blk), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, from ) + __field( ext4_lblk_t, to ) + __field( int, reverse ) + __field( int, found ) + __field( ext4_lblk_t, found_blk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->from = from; + __entry->to = to; + __entry->reverse = reverse; + __entry->found = found; + __entry->found_blk = found_blk; + ), + + TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " + "(blk = %u)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->from, (unsigned) __entry->to, + __entry->reverse, __entry->found, + (unsigned) __entry->found_blk) +); + +TRACE_EVENT(ext4_get_reserved_cluster_alloc, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), + + TP_ARGS(inode, lblk, len), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( unsigned int, len ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu lblk %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + __entry->len) +); + +TRACE_EVENT(ext4_ext_show_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + unsigned short len), + + TP_ARGS(inode, lblk, pblk, len), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned short, len ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + (unsigned long long) __entry->pblk, + (unsigned short) __entry->len) +); + +TRACE_EVENT(ext4_remove_blocks, + TP_PROTO(struct inode *inode, struct ext4_extent *ex, + ext4_lblk_t from, ext4_fsblk_t to, + ext4_fsblk_t partial_cluster), + + TP_ARGS(inode, ex, from, to, partial_cluster), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, ee_lblk ) + __field( ext4_fsblk_t, ee_pblk ) + __field( unsigned short, ee_len ) + __field( ext4_lblk_t, from ) + __field( ext4_lblk_t, to ) + __field( ext4_fsblk_t, partial ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->ee_lblk = cpu_to_le32(ex->ee_block); + __entry->ee_pblk = ext4_ext_pblock(ex); + __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->from = from; + __entry->to = to; + __entry->partial = partial_cluster; + ), + + TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" + "from %u to %u partial_cluster %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->ee_lblk, + (unsigned long 
long) __entry->ee_pblk, + (unsigned short) __entry->ee_len, + (unsigned) __entry->from, + (unsigned) __entry->to, + (unsigned) __entry->partial) +); + +TRACE_EVENT(ext4_ext_rm_leaf, + TP_PROTO(struct inode *inode, ext4_lblk_t start, + struct ext4_extent *ex, ext4_fsblk_t partial_cluster), + + TP_ARGS(inode, start, ex, partial_cluster), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, ee_lblk ) + __field( ext4_fsblk_t, ee_pblk ) + __field( short, ee_len ) + __field( ext4_fsblk_t, partial ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->start = start; + __entry->ee_lblk = le32_to_cpu(ex->ee_block); + __entry->ee_pblk = ext4_ext_pblock(ex); + __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->partial = partial_cluster; + ), + + TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" + "partial_cluster %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->start, + (unsigned) __entry->ee_lblk, + (unsigned long long) __entry->ee_pblk, + (unsigned short) __entry->ee_len, + (unsigned) __entry->partial) +); + +TRACE_EVENT(ext4_ext_rm_idx, + TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), + + TP_ARGS(inode, pblk), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_fsblk_t, pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pblk = pblk; + ), + + TP_printk("dev %d,%d ino %lu index_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pblk) +); + +TRACE_EVENT(ext4_ext_remove_space, + TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth), + + TP_ARGS(inode, start, depth), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, start ) + __field( int, depth ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->start = start; + __entry->depth = depth; + ), + + TP_printk("dev %d,%d ino %lu since %u depth %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->start, + __entry->depth) +); + +TRACE_EVENT(ext4_ext_remove_space_done, + TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, + ext4_lblk_t partial, unsigned short eh_entries), + + TP_ARGS(inode, start, depth, partial, eh_entries), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, start ) + __field( int, depth ) + __field( ext4_lblk_t, partial ) + __field( unsigned short, eh_entries ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->start = start; + __entry->depth = depth; + __entry->partial = partial; + __entry->eh_entries = eh_entries; + ), + + TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " + "remaining_entries %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->start, + __entry->depth, + (unsigned) __entry->partial, + (unsigned short) __entry->eh_entries) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3-58-ga151 From 5356f2615cd558c57a1f7d7528d1ad4de3640d96 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 9 Sep 2011 19:20:51 -0400 Subject: ext4: attempt to fix race in bigalloc code path Currently, there exists a race between delayed 
allocated writes and the writeback when the bigalloc feature is in use. The race was because we wanted to determine what blocks in a cluster are under delayed allocation and we were using the buffer_delay(bh) check for it. But the writeback codepath clears this bit without any synchronization, which resulted in a race and an ext4 warning similar to: EXT4-fs (ram1): ext4_da_update_reserve_space: ino 13, used 1 with only 0 reserved data blocks The race existed in two places. (1) between ext4_find_delalloc_range() and ext4_map_blocks() when called from the writeback code path. (2) between ext4_find_delalloc_range() and ext4_da_get_block_prep() (where buffer_delay(bh) is set). To fix (1), this patch introduces a new buffer_head state bit - BH_Da_Mapped. This bit is set under the protection of EXT4_I(inode)->i_data_sem when we have actually mapped the delayed allocated blocks during the writeout time. We can now reliably check for this bit inside ext4_find_delalloc_range() to determine whether the reservation for the blocks has already been claimed or not. To fix (2), it was necessary to set buffer_delay(bh) under the protection of i_data_sem. So, I extracted the very beginning of ext4_map_blocks into a new function - ext4_da_map_blocks() - and performed the required setting of the buffer_delay bit and the quota reservation under the protection of i_data_sem. These two fixes make the checking of buffer_delay(bh) and buffer_da_mapped(bh) consistent, thus removing the race. Tested: I was able to reproduce the problem by running 'dd' and 'fsync' in parallel. Also, xfstests sometimes used to reproduce this race. After the fix both my test and xfstests were successful and no race (warning message) was observed. Google-Bug-Id: 4997027 Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 5 +- fs/ext4/extents.c | 38 +++++--------- fs/ext4/inode.c | 146 +++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 134 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 751277a4890c..1bbd2caebe7f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1893,7 +1893,6 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); -extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2300,10 +2299,14 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9b119308daea..ad39627c1fbc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3296,28 +3296,9 @@ static int ext4_find_delalloc_range(struct inode *inode, while ((i >= lblk_start) && (i <= lblk_end)) { page = find_get_page(mapping, index); - if (!page || !PageDirty(page)) + if (!page) goto nextpage; - if (PageWriteback(page)) { - /* - * This might be a race with allocation and writeout.
In - * this case we just assume that the rest of the range - * will eventually be written and there wont be any - * delalloc blocks left. - * TODO: the above assumption is troublesome, but might - * work better in practice. other option could be note - * somewhere that the cluster is getting written out and - * detect that here. - */ - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 0, i); - return 0; - } - if (!page_has_buffers(page)) goto nextpage; @@ -3340,7 +3321,11 @@ static int ext4_find_delalloc_range(struct inode *inode, continue; } - if (buffer_delay(bh)) { + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { page_cache_release(page); trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, @@ -4106,6 +4091,7 @@ got_allocated_blocks: ext4_da_update_reserve_space(inode, allocated_clusters, 1); if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - reserved_clusters; /* @@ -4148,11 +4134,11 @@ got_allocated_blocks: * remaining blocks finally gets written, we * could claim them. */ - while (reservation) { - ext4_da_reserve_space(inode, - map->m_lblk); - reservation--; - } + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); } } } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2dcd4fed96ec..1380cd29c312 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -397,6 +397,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +/* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. + */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -516,9 +559,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. 
+ */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); + } + up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -1038,7 +1089,7 @@ static int ext4_journalled_write_end(struct file *file, /* * Reserve a single cluster located at lblock */ -int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -1153,6 +1204,7 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); @@ -1271,6 +1323,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1603,6 +1657,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } +/* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. + */ +static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + struct ext4_map_blocks *map, + struct buffer_head *bh) +{ + int retval; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + map->m_flags = 0; + ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," + "logical block %lu\n", inode->i_ino, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); + else + retval = ext4_ind_map_blocks(NULL, inode, map, 0); + + if (retval == 0) { + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + /* If the block was allocated from previously allocated cluster, + * then we dont need to reserve it again. */ + if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + retval = ext4_da_reserve_space(inode, iblock); + if (retval) + /* not enough space to reserve */ + goto out_unlock; + } + + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served + * and it should not appear on the bh->b_state. + */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + } + +out_unlock: + up_read((&EXT4_I(inode)->i_data_sem)); + + return retval; +} + /* * This is a special get_blocks_t callback which is used by * ext4_da_write_begin(). 
It will either return mapped block or @@ -1620,10 +1734,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, { struct ext4_map_blocks map; int ret = 0; - sector_t invalid_block = ~((sector_t) 0xffff); - - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); @@ -1636,29 +1746,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) + ret = ext4_da_map_blocks(inode, iblock, &map, bh); + if (ret <= 0) return ret; - if (ret == 0) { - if (buffer_delay(bh)) - return 0; /* Not sure this could or should happen */ - /* - * XXX: __block_write_begin() unmaps passed block, is it OK? - */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ - if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) { - ret = ext4_da_reserve_space(inode, iblock); - if (ret) - /* not enough space to reserve */ - return ret; - } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - return 0; - } map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; -- cgit v1.2.3-58-ga151 From e8a0e41266e9c207ad8ac158cee9547ef1bc90ac Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 13 Jun 2011 22:16:44 +0200 Subject: jffs2: Avoid unneeded 'if' before kfree kfree() deals gracefully with NULL pointers, so it's pointless to test for one prior to calling it. This removes such a test from jffs2_scan_medium(). Signed-off-by: Jesper Juhl Signed-off-by: Artem Bityutskiy --- fs/jffs2/scan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 8d8cd3419d02..28107ca136e4 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) else c->mtd->unpoint(c->mtd, 0, c->mtd->size); #endif - if (s) - kfree(s); - + kfree(s); return ret; } -- cgit v1.2.3-58-ga151 From 51b11e3630672b7ce8793ecf13e5759656edf38a Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Tue, 28 Jun 2011 00:21:30 +0400 Subject: jffs2: use mutex_is_locked() in __jffs2_flush_wbuf() Use a helper to test if a mutex is held instead of a hack with mutex_trylock(). Signed-off-by: Alexey Khoroshilov Signed-off-by: Artem Bityutskiy --- fs/jffs2/wbuf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4515bea0268f..a10fb24ca6e6 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (!jffs2_is_writebuffered(c)) return 0; - if (mutex_trylock(&c->alloc_sem)) { - mutex_unlock(&c->alloc_sem); + if (!mutex_is_locked(&c->alloc_sem)) { printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } -- cgit v1.2.3-58-ga151 From 0612b9ddc2eeda014dd805c87c752b342d8f80f0 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 30 Aug 2011 18:45:40 -0700 Subject: mtd: rename MTD_OOB_* to MTD_OPS_* These modes are not necessarily for OOB only. Particularly, MTD_OOB_RAW affected operations on in-band page data as well. To clarify these options and to emphasize that their effect is applied per-operation, we change the primary prefix to MTD_OPS_. 
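For illustration, a minimal sketch of an OOB read under the new names (here "offs", "databuf" and "oobbuf" stand in for values the calling driver supplies; "mtd" is the usual struct mtd_info handle):

	struct mtd_oob_ops ops;
	int ret;

	ops.mode    = MTD_OPS_AUTO_OOB;	/* was MTD_OOB_AUTO */
	ops.len     = mtd->writesize;	/* in-band page data to read */
	ops.datbuf  = databuf;
	ops.ooblen  = mtd->ecclayout->oobavail;	/* OOB bytes, placed per ecclayout */
	ops.ooboffs = 0;
	ops.oobbuf  = oobbuf;
	ops.retlen  = ops.oobretlen = 0;
	ret = mtd->read_oob(mtd, offs, &ops);	/* mode applies to data and OOB alike */

Only the identifiers change: the enum values (0, 1, 2) and the per-operation semantics are untouched, so existing users of the ABI behave exactly as before.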
Signed-off-by: Brian Norris Signed-off-by: Artem Bityutskiy --- drivers/mtd/devices/doc2000.c | 4 ++-- drivers/mtd/devices/doc2001.c | 4 ++-- drivers/mtd/devices/doc2001plus.c | 4 ++-- drivers/mtd/inftlcore.c | 6 ++--- drivers/mtd/mtdchar.c | 10 +++++---- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 6 ++--- drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 2 +- drivers/mtd/nand/nand_base.c | 40 +++++++++++++++++----------------- drivers/mtd/nand/nand_bbt.c | 8 +++---- drivers/mtd/nand/sm_common.c | 2 +- drivers/mtd/nftlcore.c | 6 ++--- drivers/mtd/onenand/onenand_base.c | 38 ++++++++++++++++---------------- drivers/mtd/onenand/onenand_bbt.c | 2 +- drivers/mtd/sm_ftl.c | 4 ++-- drivers/mtd/ssfdc.c | 2 +- drivers/mtd/tests/mtd_oobtest.c | 24 ++++++++++---------- drivers/mtd/tests/mtd_readtest.c | 2 +- drivers/staging/spectra/lld_mtd.c | 6 ++--- fs/jffs2/wbuf.c | 6 ++--- include/linux/mtd/mtd.h | 2 +- include/mtd/mtd-abi.h | 16 +++++++------- 22 files changed, 99 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c index 8c9703309496..e9fad9151219 100644 --- a/drivers/mtd/devices/doc2000.c +++ b/drivers/mtd/devices/doc2000.c @@ -927,7 +927,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; @@ -1091,7 +1091,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, struct DiskOnChip *this = mtd->priv; int ret; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); mutex_lock(&this->lock); ret = doc_write_oob_nolock(mtd, ofs + ops->ooboffs, ops->len, diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c index 3d2b459cea92..a3f7a27499be 100644 --- a/drivers/mtd/devices/doc2001.c +++ b/drivers/mtd/devices/doc2001.c @@ -631,7 +631,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; @@ -689,7 +689,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c index d28c9d99979f..99351bc3e0ed 100644 --- a/drivers/mtd/devices/doc2001plus.c +++ b/drivers/mtd/devices/doc2001plus.c @@ -834,7 +834,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; @@ -919,7 +919,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 21aa981e42cd..652065e47a79 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -152,7 +152,7 @@ int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & (mtd->writesize - 1); ops.ooblen = len; ops.oobbuf = buf; @@ -172,7 +172,7 @@ int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len, 
struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & (mtd->writesize - 1); ops.ooblen = len; ops.oobbuf = buf; @@ -192,7 +192,7 @@ static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs; ops.ooblen = mtd->oobsize; ops.oobbuf = oob; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index d0eaef67b9bb..9b76438a3c27 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -221,7 +221,7 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.datbuf = kbuf; ops.oobbuf = NULL; ops.len = len; @@ -317,7 +317,7 @@ static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.datbuf = kbuf; ops.oobbuf = NULL; ops.ooboffs = 0; @@ -413,7 +413,8 @@ static int mtd_do_writeoob(struct file *file, struct mtd_info *mtd, ops.ooblen = length; ops.ooboffs = start & (mtd->writesize - 1); ops.datbuf = NULL; - ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OPS_RAW : + MTD_OPS_PLACE_OOB; if (ops.ooboffs && ops.ooblen > (mtd->oobsize - ops.ooboffs)) return -EINVAL; @@ -457,7 +458,8 @@ static int mtd_do_readoob(struct file *file, struct mtd_info *mtd, ops.ooblen = length; ops.ooboffs = start & (mtd->writesize - 1); ops.datbuf = NULL; - ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OPS_RAW : + MTD_OPS_PLACE_OOB; if (ops.ooboffs && ops.ooblen > (mtd->oobsize - ops.ooboffs)) return -EINVAL; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index c90b7ba362d7..cd7785aa1649 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -130,7 +130,7 @@ static int part_read_oob(struct mtd_info *mtd, loff_t from, if (ops->oobbuf) { size_t len, pages; - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) len = mtd->oobavail; else len = mtd->oobsize; diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 9961063b90a2..910309f260f8 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -350,7 +350,7 @@ static int mtdswap_read_markers(struct mtdswap_dev *d, struct swap_eb *eb) ops.oobbuf = d->oob_buf; ops.ooboffs = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ret = mtdswap_read_oob(d, offset, &ops); @@ -389,7 +389,7 @@ static int mtdswap_write_marker(struct mtdswap_dev *d, struct swap_eb *eb, ops.ooboffs = 0; ops.oobbuf = (uint8_t *)&n; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = NULL; if (marker == MTDSWAP_TYPE_CLEAN) { @@ -931,7 +931,7 @@ static unsigned int mtdswap_eblk_passes(struct mtdswap_dev *d, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = mtd->writesize; ops.ooblen = mtd->ecclayout->oobavail; ops.ooboffs = 0; diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c index 5c0fe0dd7057..071b63420f0e 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c @@ -1104,7 +1104,7 @@ gpmi_ecc_write_oob(struct mtd_info *mtd, struct nand_chip *chip, int page) * The BCH will use all the (page + oob). 
* Our gpmi_hw_ecclayout can only prohibit the JFFS2 to write the oob. * But it can not stop some ioctls such MEMWRITEOOB which uses - * MTD_OOB_PLACE. So We have to implement this function to prohibit + * MTD_OPS_PLACE_OOB. So We have to implement this function to prohibit * these ioctls too. */ return -EPERM; diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index ad40607f5f24..686b55770113 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1382,12 +1382,12 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob, { switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_RAW: memcpy(oob, chip->oob_poi + ops->ooboffs, len); return oob + len; - case MTD_OOB_AUTO: { + case MTD_OPS_AUTO_OOB: { struct nand_oobfree *free = chip->ecc.layout->oobfree; uint32_t boffs = 0, roffs = ops->ooboffs; size_t bytes = 0; @@ -1437,7 +1437,7 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, int ret = 0; uint32_t readlen = ops->len; uint32_t oobreadlen = ops->ooblen; - uint32_t max_oobsize = ops->mode == MTD_OOB_AUTO ? + uint32_t max_oobsize = ops->mode == MTD_OPS_AUTO_OOB ? mtd->oobavail : mtd->oobsize; uint8_t *bufpoi, *oob, *buf; @@ -1469,7 +1469,7 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, } /* Now read the page into the buffer */ - if (unlikely(ops->mode == MTD_OOB_RAW)) + if (unlikely(ops->mode == MTD_OPS_RAW)) ret = chip->ecc.read_page_raw(mtd, chip, bufpoi, page); else if (!aligned && NAND_SUBPAGE_READ(chip) && !oob) @@ -1759,7 +1759,7 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from, stats = mtd->ecc_stats; - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) len = chip->ecc.layout->oobavail; else len = mtd->oobsize; @@ -1787,7 +1787,7 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from, page = realpage & chip->pagemask; while (1) { - if (ops->mode == MTD_OOB_RAW) + if (ops->mode == MTD_OPS_RAW) sndcmd = chip->ecc.read_oob_raw(mtd, chip, page, sndcmd); else sndcmd = chip->ecc.read_oob(mtd, chip, page, sndcmd); @@ -1865,9 +1865,9 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, nand_get_device(chip, mtd, FL_READING); switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: + case MTD_OPS_RAW: break; default: @@ -2113,12 +2113,12 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len, switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_RAW: memcpy(chip->oob_poi + ops->ooboffs, oob, len); return oob + len; - case MTD_OOB_AUTO: { + case MTD_OPS_AUTO_OOB: { struct nand_oobfree *free = chip->ecc.layout->oobfree; uint32_t boffs = 0, woffs = ops->ooboffs; size_t bytes = 0; @@ -2167,7 +2167,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, uint32_t writelen = ops->len; uint32_t oobwritelen = ops->ooblen; - uint32_t oobmaxlen = ops->mode == MTD_OOB_AUTO ? + uint32_t oobmaxlen = ops->mode == MTD_OPS_AUTO_OOB ? 
mtd->oobavail : mtd->oobsize; uint8_t *oob = ops->oobbuf; @@ -2236,7 +2236,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, } ret = chip->write_page(mtd, chip, wbuf, page, cached, - (ops->mode == MTD_OOB_RAW)); + (ops->mode == MTD_OPS_RAW)); if (ret) break; @@ -2356,7 +2356,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, pr_debug("%s: to = 0x%08x, len = %i\n", __func__, (unsigned int)to, (int)ops->ooblen); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) len = chip->ecc.layout->oobavail; else len = mtd->oobsize; @@ -2408,7 +2408,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, nand_fill_oob(mtd, ops->oobbuf, ops->ooblen, ops); - if (ops->mode == MTD_OOB_RAW) + if (ops->mode == MTD_OPS_RAW) status = chip->ecc.write_oob_raw(mtd, chip, page & chip->pagemask); else status = chip->ecc.write_oob(mtd, chip, page & chip->pagemask); @@ -2445,9 +2445,9 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, nand_get_device(chip, mtd, FL_WRITING); switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: + case MTD_OPS_RAW: break; default: diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 6aa8125772b8..c488bcb84c90 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -302,7 +302,7 @@ static int scan_read_raw_oob(struct mtd_info *mtd, uint8_t *buf, loff_t offs, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.ooboffs = 0; ops.ooblen = mtd->oobsize; @@ -350,7 +350,7 @@ static int scan_write_bbt(struct mtd_info *mtd, loff_t offs, size_t len, { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = 0; ops.ooblen = mtd->oobsize; ops.datbuf = buf; @@ -433,7 +433,7 @@ static int scan_block_fast(struct mtd_info *mtd, struct nand_bbt_descr *bd, ops.oobbuf = buf; ops.ooboffs = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; for (j = 0; j < len; j++) { /* @@ -672,7 +672,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, ops.ooblen = mtd->oobsize; ops.ooboffs = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; if (!rcode) rcode = 0xff; diff --git a/drivers/mtd/nand/sm_common.c b/drivers/mtd/nand/sm_common.c index b6332e83b289..2c438ef53cf5 100644 --- a/drivers/mtd/nand/sm_common.c +++ b/drivers/mtd/nand/sm_common.c @@ -47,7 +47,7 @@ static int sm_block_markbad(struct mtd_info *mtd, loff_t ofs) /* As long as this function is called on erase block boundaries it will work correctly for 256 byte nand */ - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = 0; ops.ooblen = mtd->oobsize; ops.oobbuf = (void *)&oob; diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 93d6fc68b892..272e3c03e324 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -147,7 +147,7 @@ int nftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & mask; ops.ooblen = len; ops.oobbuf = buf; @@ -168,7 +168,7 @@ int nftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & mask; ops.ooblen = len; ops.oobbuf = buf; @@ -191,7 +191,7 @@ static int nftl_write(struct mtd_info *mtd, loff_t offs, size_t len, 
struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & mask; ops.ooblen = mtd->oobsize; ops.oobbuf = oob; diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 493901a59e6e..a52aa0f6b0c3 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1125,7 +1125,7 @@ static int onenand_mlc_read_ops_nolock(struct mtd_info *mtd, loff_t from, pr_debug("%s: from = 0x%08x, len = %i\n", __func__, (unsigned int)from, (int)len); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1170,7 +1170,7 @@ static int onenand_mlc_read_ops_nolock(struct mtd_info *mtd, loff_t from, thisooblen = oobsize - oobcolumn; thisooblen = min_t(int, thisooblen, ooblen - oobread); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) onenand_transfer_auto_oob(mtd, oobbuf, oobcolumn, thisooblen); else this->read_bufferram(mtd, ONENAND_SPARERAM, oobbuf, oobcolumn, thisooblen); @@ -1229,7 +1229,7 @@ static int onenand_read_ops_nolock(struct mtd_info *mtd, loff_t from, pr_debug("%s: from = 0x%08x, len = %i\n", __func__, (unsigned int)from, (int)len); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1291,7 +1291,7 @@ static int onenand_read_ops_nolock(struct mtd_info *mtd, loff_t from, thisooblen = oobsize - oobcolumn; thisooblen = min_t(int, thisooblen, ooblen - oobread); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) onenand_transfer_auto_oob(mtd, oobbuf, oobcolumn, thisooblen); else this->read_bufferram(mtd, ONENAND_SPARERAM, oobbuf, oobcolumn, thisooblen); @@ -1363,7 +1363,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, /* Initialize return length value */ ops->oobretlen = 0; - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1409,7 +1409,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, break; } - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) onenand_transfer_auto_oob(mtd, buf, column, thislen); else this->read_bufferram(mtd, ONENAND_SPARERAM, buf, column, thislen); @@ -1487,10 +1487,10 @@ static int onenand_read_oob(struct mtd_info *mtd, loff_t from, int ret; switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: break; - case MTD_OOB_RAW: + case MTD_OPS_RAW: /* Not implemented yet */ default: return -EINVAL; @@ -1908,7 +1908,7 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, if (!len) return 0; - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1945,7 +1945,7 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, /* We send data to spare ram with oobsize * to prevent byte access */ memset(oobbuf, 0xff, mtd->oobsize); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) onenand_fill_auto_oob(mtd, oobbuf, oob, oobcolumn, thisooblen); else memcpy(oobbuf + oobcolumn, oob, thisooblen); @@ -2084,7 +2084,7 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, /* Initialize retlen, in case of early exit */ ops->oobretlen = 0; - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else 
oobsize = mtd->oobsize; @@ -2128,7 +2128,7 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, /* We send data to spare ram with oobsize * to prevent byte access */ memset(oobbuf, 0xff, mtd->oobsize); - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) onenand_fill_auto_oob(mtd, oobbuf, buf, column, thislen); else memcpy(oobbuf + column, buf, thislen); @@ -2217,10 +2217,10 @@ static int onenand_write_oob(struct mtd_info *mtd, loff_t to, int ret; switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: break; - case MTD_OOB_RAW: + case MTD_OPS_RAW: /* Not implemented yet */ default: return -EINVAL; @@ -2603,7 +2603,7 @@ static int onenand_default_block_markbad(struct mtd_info *mtd, loff_t ofs) struct bbm_info *bbm = this->bbm; u_char buf[2] = {0, 0}; struct mtd_oob_ops ops = { - .mode = MTD_OOB_PLACE, + .mode = MTD_OPS_PLACE_OOB, .ooblen = 2, .oobbuf = buf, .ooboffs = 0, @@ -3171,7 +3171,7 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len, this->command(mtd, ONENAND_CMD_RESET, 0, 0); this->wait(mtd, FL_RESETING); } else { - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooblen = len; ops.oobbuf = buf; ops.ooboffs = 0; @@ -3677,7 +3677,7 @@ static int flexonenand_check_blocks_erased(struct mtd_info *mtd, int start, int int i, ret; int block; struct mtd_oob_ops ops = { - .mode = MTD_OOB_PLACE, + .mode = MTD_OPS_PLACE_OOB, .ooboffs = 0, .ooblen = mtd->oobsize, .datbuf = NULL, diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c index 3b9a2a9573c6..09956c4fd850 100644 --- a/drivers/mtd/onenand/onenand_bbt.c +++ b/drivers/mtd/onenand/onenand_bbt.c @@ -80,7 +80,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr startblock = 0; from = 0; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooblen = readlen; ops.oobbuf = buf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 311a9e39a956..d927641cb0f5 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -256,7 +256,7 @@ static int sm_read_sector(struct sm_ftl *ftl, if (!oob) oob = &tmp_oob; - ops.mode = ftl->smallpagenand ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = ftl->smallpagenand ? MTD_OPS_RAW : MTD_OPS_PLACE_OOB; ops.ooboffs = 0; ops.ooblen = SM_OOB_SIZE; ops.oobbuf = (void *)oob; @@ -336,7 +336,7 @@ static int sm_write_sector(struct sm_ftl *ftl, if (ftl->unstable) return -EIO; - ops.mode = ftl->smallpagenand ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = ftl->smallpagenand ? 
MTD_OPS_RAW : MTD_OPS_PLACE_OOB; ops.len = SM_SECTOR_SIZE; ops.datbuf = buffer; ops.ooboffs = 0; diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 5f917f0a9609..976e3d28b962 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -169,7 +169,7 @@ static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf) struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.ooboffs = 0; ops.ooblen = OOB_SIZE; ops.oobbuf = buf; diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c index dec92ae6111a..6bcff42296a9 100644 --- a/drivers/mtd/tests/mtd_oobtest.c +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -131,7 +131,7 @@ static int write_eraseblock(int ebnum) for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { set_random_data(writebuf, use_len); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = use_len; @@ -184,7 +184,7 @@ static int verify_eraseblock(int ebnum) for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { set_random_data(writebuf, use_len); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = use_len; @@ -211,7 +211,7 @@ static int verify_eraseblock(int ebnum) if (use_offset != 0 || use_len < mtd->ecclayout->oobavail) { int k; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail; @@ -276,7 +276,7 @@ static int verify_eraseblock_in_one_go(int ebnum) size_t len = mtd->ecclayout->oobavail * pgcnt; set_random_data(writebuf, len); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = len; @@ -507,7 +507,7 @@ static int __init mtd_oobtest_init(void) addr0 += mtd->erasesize; /* Attempt to write off end of OOB */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = 1; @@ -527,7 +527,7 @@ static int __init mtd_oobtest_init(void) } /* Attempt to read off end of OOB */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = 1; @@ -551,7 +551,7 @@ static int __init mtd_oobtest_init(void) "block is bad\n"); else { /* Attempt to write off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail + 1; @@ -571,7 +571,7 @@ static int __init mtd_oobtest_init(void) } /* Attempt to read off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail + 1; @@ -595,7 +595,7 @@ static int __init mtd_oobtest_init(void) goto out; /* Attempt to write off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail; @@ -615,7 +615,7 @@ static int __init mtd_oobtest_init(void) } /* Attempt to read off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail; @@ -655,7 +655,7 @@ static int __init mtd_oobtest_init(void) addr = (i + 1) * mtd->erasesize - mtd->writesize; for (pg = 0; pg < cnt; ++pg) { set_random_data(writebuf, sz); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = sz; @@ -683,7 +683,7 @@ static int __init mtd_oobtest_init(void) continue; set_random_data(writebuf, mtd->ecclayout->oobavail * 2); addr = (i + 1) * mtd->erasesize - mtd->writesize; - ops.mode 
= MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail * 2; diff --git a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c index 836792d1d60e..587e1e371c6c 100644 --- a/drivers/mtd/tests/mtd_readtest.c +++ b/drivers/mtd/tests/mtd_readtest.c @@ -66,7 +66,7 @@ static int read_eraseblock_by_page(int ebnum) if (mtd->oobsize) { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->oobsize; diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index 2bd34662beb5..a9c309a167c2 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -340,7 +340,7 @@ u16 mtd_Read_Page_Main_Spare(u8 *read_data, u32 Block, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = read_data; ops.len = DeviceInfo.wPageDataSize; ops.oobbuf = read_data + DeviceInfo.wPageDataSize + BTSIG_OFFSET; @@ -400,7 +400,7 @@ u16 mtd_Write_Page_Main_Spare(u8 *write_data, u32 Block, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = write_data; ops.len = DeviceInfo.wPageDataSize; ops.oobbuf = write_data + DeviceInfo.wPageDataSize + BTSIG_OFFSET; @@ -473,7 +473,7 @@ u16 mtd_Read_Page_Spare(u8 *read_data, u32 Block, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = NULL; ops.len = 0; ops.oobbuf = read_data; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a10fb24ca6e6..b09e51d2f81f 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1025,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); struct mtd_oob_ops ops; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1068,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1094,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = (uint8_t *)&oob_cleanmarker; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 6882cd968a3e..c2047b85691d 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -79,7 +79,7 @@ struct mtd_erase_region_info { * @ooblen: number of oob bytes to write/read * @oobretlen: number of oob bytes written/read * @ooboffs: offset of oob data in the oob area (only relevant when - * mode = MTD_OOB_PLACE) + * mode = MTD_OPS_PLACE_OOB) * @datbuf: data buffer - if NULL only oob data are read/written * @oobbuf: oob data buffer * diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index af42c7a34805..d30990e1858b 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -46,18 +46,18 @@ struct mtd_oob_buf64 { }; /* - * oob operation modes + * MTD operation modes * - * MTD_OOB_PLACE: oob data are placed at the given offset (default) - * MTD_OOB_AUTO: oob data are automatically placed 
at the free areas - * which are defined by the internal ecclayout - * MTD_OOB_RAW: mode to read or write oob and data without doing ECC + * MTD_OPS_PLACE_OOB: oob data are placed at the given offset (default) + * MTD_OPS_AUTO_OOB: oob data are automatically placed at the free areas + * which are defined by the internal ecclayout + * MTD_OPS_RAW: mode to read or write oob and data without doing ECC * checking */ enum { - MTD_OOB_PLACE = 0, - MTD_OOB_AUTO = 1, - MTD_OOB_RAW = 2, + MTD_OPS_PLACE_OOB = 0, + MTD_OPS_AUTO_OOB = 1, + MTD_OPS_RAW = 2, }; #define MTD_ABSENT 0 -- cgit v1.2.3-58-ga151 From 4581d1409977c5fe686a4ed487cdce2e50031826 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 14:56:09 -0400 Subject: nfsd4: rearrange to avoid a forward reference Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c28432a80210..f0eccc236a0d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -60,7 +60,6 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); @@ -1061,6 +1060,32 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static int +same_stateid(stateid_t *id_one, stateid_t *id_two) +{ + if (id_one->si_stateownerid != id_two->si_stateownerid) + return 0; + return id_one->si_fileid == id_two->si_fileid; +} + +static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) +{ + struct nfs4_stateid *s; + unsigned int hashval; + + hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); + list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { + if (!same_stateid(&s->st_stateid, t)) + continue; + if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) + return NULL; + if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) + return NULL; + return s; + } + return NULL; +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -3694,32 +3719,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static int -same_stateid(stateid_t *id_one, stateid_t *id_two) -{ - if (id_one->si_stateownerid != id_two->si_stateownerid) - return 0; - return id_one->si_fileid == id_two->si_fileid; -} - -static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) -{ - struct nfs4_stateid *s; - unsigned int hashval; - - hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); - list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { - if (!same_stateid(&s->st_stateid, t)) - continue; - if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) - return NULL; - if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) - return NULL; - return s; - } - return NULL; -} - static struct nfs4_delegation * search_for_delegation(stateid_t *stid) { -- cgit v1.2.3-58-ga151 From 4d71ab8751c1d749f9fdf84ec094989adf579493 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Tue, 6 Sep 2011 16:48:57 -0400 Subject: nfsd4: split up find_stateid Minor cleanup. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f0eccc236a0d..aa088bc3b169 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1068,21 +1068,29 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) +static struct nfs4_stateid *find_stateid(stateid_t *t) { struct nfs4_stateid *s; unsigned int hashval; hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); - list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { - if (!same_stateid(&s->st_stateid, t)) - continue; - if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) - return NULL; - if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) - return NULL; + list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) + if (same_stateid(&s->st_stateid, t)) + return s; + return NULL; +} + +static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, int flags) +{ + struct nfs4_stateid *s; + + s = find_stateid(t); + if (!s) + return NULL; + if (flags & LOCK_STATE && s->st_type == NFS4_LOCK_STID) + return s; + if (flags & OPEN_STATE && s->st_type == NFS4_OPEN_STID) return s; - } return NULL; } @@ -3241,7 +3249,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_expired; - stp = find_stateid(stateid, 0); + stp = find_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3306,7 +3314,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, BUG_ON(!*filpp); } } else { /* open or lock stateid */ - stp = find_stateid(stateid, flags); + stp = find_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3381,7 +3389,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - stp = find_stateid(stateid, 0); + stp = find_stateid(stateid); if (!stp) { ret = nfserr_bad_stateid; goto out; @@ -3440,7 +3448,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * the confirmed flag is incorrecly set, or the generation * number is incorrect. */ - *stpp = find_stateid(stateid, flags); + *stpp = find_stateid_by_type(stateid, flags); if (*stpp == NULL) return nfserr_expired; -- cgit v1.2.3-58-ga151 From c0a5d93efbbb79117bdf7f5f81fba9d679c35dfa Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 15:19:46 -0400 Subject: nfsd4: split preprocess_seqid, cleanup Move most of this into helper functions. Also move the non-CONFIRM case into caller, providing a helper function for that purpose. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 87 +++++++++++++++++++++++++++-------------------------- fs/nfsd/state.h | 1 - 2 files changed, 45 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aa088bc3b169..ad20bbf0a1f8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3253,7 +3253,6 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) if (!stp) goto out; status = nfserr_bad_stateid; - if (stp->st_stateowner->so_is_open_owner && !openowner(stp->st_stateowner)->oo_confirmed) goto out; @@ -3418,6 +3417,29 @@ setlkflg (int type) RD_STATE : WR_STATE; } +static __be32 nfs4_nospecial_stateid_checks(stateid_t *stateid) +{ + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + return nfs_ok; +} + +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + if (nfs4_check_fh(current_fh, stp)) + return nfserr_bad_stateid; + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + return check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); +} + /* * Checks for sequence id mutating operations. */ @@ -3426,54 +3448,36 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateid **stpp) { - struct nfs4_stateowner *sop; - struct svc_fh *current_fh = &cstate->current_fh; __be32 status; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); - return nfserr_bad_stateid; - } - - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - - /* - * We return BAD_STATEID if filehandle doesn't match stateid, - * the confirmed flag is incorrecly set, or the generation - * number is incorrect. 
- */ + status = nfs4_nospecial_stateid_checks(stateid); + if (status) + return status; *stpp = find_stateid_by_type(stateid, flags); if (*stpp == NULL) return nfserr_expired; + cstate->replay_owner = (*stpp)->st_stateowner; + renew_client((*stpp)->st_stateowner->so_client); - sop = (*stpp)->st_stateowner; - cstate->replay_owner = sop; + return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); +} - if (nfs4_check_fh(current_fh, *stpp)) { - dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); - return nfserr_bad_stateid; - } +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_stateid **stpp) +{ + __be32 status; + struct nfs4_openowner *oo; - status = nfsd4_check_seqid(cstate, sop, seqid); + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + OPEN_STATE, stpp); if (status) return status; - - if (sop->so_is_open_owner && !openowner(sop)->oo_confirmed - && !(flags & CONFIRM)) { - dprintk("NFSD: preprocess_seqid_op: stateowner not" - " confirmed yet!\n"); + oo = openowner((*stpp)->st_stateowner); + if (!oo->oo_confirmed) return nfserr_bad_stateid; - } - status = check_stateid_generation(stateid, &(*stpp)->st_stateid, nfsd4_has_session(cstate)); - if (status) - return status; - renew_client(sop->so_client); return nfs_ok; } @@ -3497,7 +3501,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, &stp); + OPEN_STATE, &stp); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3557,8 +3561,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, - &od->od_stateid, OPEN_STATE, &stp); + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp); if (status) goto out; status = nfserr_inval; @@ -3602,9 +3606,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - OPEN_STATE, &stp); + status = nfs4_preprocess_confirmed_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, &stp); if (stp == NULL && status == nfserr_expired) { /* * Also, we should make sure this isn't just the result of @@ -3963,10 +3966,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, &open_stp); + &open_stp); if (status) goto out; open_sop = openowner(open_stp->st_stateowner); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 7994ed9be3cc..9745cc781e74 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -463,7 +463,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 -- cgit v1.2.3-58-ga151 From 2288d0e3958b94bcc3c00a78ea06909a8eb66378 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 15:50:21 -0400 Subject: nfsd4: pass around typemask instead of flags We're only using those flags to choose lock or open stateid's at this point. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 18 ++++++++---------- fs/nfsd/state.h | 2 -- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ad20bbf0a1f8..48134635fc2f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1080,16 +1080,14 @@ static struct nfs4_stateid *find_stateid(stateid_t *t) return NULL; } -static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, int flags) +static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, char typemask) { struct nfs4_stateid *s; s = find_stateid(t); if (!s) return NULL; - if (flags & LOCK_STATE && s->st_type == NFS4_LOCK_STID) - return s; - if (flags & OPEN_STATE && s->st_type == NFS4_OPEN_STID) + if (typemask & s->st_type) return s; return NULL; } @@ -3445,7 +3443,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, int flags, + stateid_t *stateid, char typemask, struct nfs4_stateid **stpp) { __be32 status; @@ -3457,7 +3455,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, status = nfs4_nospecial_stateid_checks(stateid); if (status) return status; - *stpp = find_stateid_by_type(stateid, flags); + *stpp = find_stateid_by_type(stateid, typemask); if (*stpp == NULL) return nfserr_expired; cstate->replay_owner = (*stpp)->st_stateowner; @@ -3472,7 +3470,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_openowner *oo; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - OPEN_STATE, stpp); + NFS4_OPEN_STID, stpp); if (status) return status; oo = openowner((*stpp)->st_stateowner); @@ -3501,7 +3499,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - OPEN_STATE, &stp); + NFS4_OPEN_STID, &stp); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3999,7 +3997,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, - LOCK_STATE, &lock_stp); + NFS4_LOCK_STID, &lock_stp); if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); @@ -4197,7 +4195,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, LOCK_STATE, &stp); + &locku->lu_stateid, NFS4_LOCK_STID, &stp); if (status) goto out; filp = find_any_file(stp->st_file); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9745cc781e74..ef949eb3a86e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -463,8 +463,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define OPEN_STATE 0x00000004 -#define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -- cgit v1.2.3-58-ga151 From 881ea2b11e8a786a8d30c108261296953380bead Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 17:20:34 -0400 Subject: nfsd4: rename init_stateid Note this is actually open-stateid specific. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 48134635fc2f..73b5e1e264fa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2314,7 +2314,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str } static inline void -init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +init_open_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); @@ -2934,7 +2934,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; - init_stateid(stp, fp, open); + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); -- cgit v1.2.3-58-ga151 From 91a8c04031d051db41ec2a8c15e24622da037382 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 7 Sep 2011 12:54:06 -0400 Subject: nfsd4: remove redundant stateid initialization Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 73b5e1e264fa..768382d1de97 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2318,10 +2318,7 @@ init_open_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_o struct nfs4_openowner *oo = open->op_openowner; unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); - INIT_LIST_HEAD(&stp->st_perfile); list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -3874,10 +3871,6 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); -- cgit v1.2.3-58-ga151 From dcef0413da9a17bfca917d8b49baf309ce76b737 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 7 Sep 2011 16:06:42 -0400 Subject: nfsd4: move some of nfs4_stateid into a separate structure We want delegations to share more with open/lock stateid's, so first we'll pull out some of the common stuff we want to share. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 174 ++++++++++++++++++++++++++-------------------------- fs/nfsd/state.h | 29 ++++++--- 2 files changed, 106 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 768382d1de97..d0bb5a5613a9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -159,7 +159,7 @@ static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_stateid */ +/* hash table for (open)nfs4_ol_stateid */ #define STATEID_HASH_BITS 10 #define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) #define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) @@ -219,7 +219,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; @@ -380,7 +380,7 @@ set_deny(unsigned int *deny, unsigned long bmap) { } static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { +test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { unsigned int access, deny; set_access(&access, stp->st_access_bmap); @@ -403,14 +403,14 @@ static int nfs4_access_to_omode(u32 access) BUG(); } -static void unhash_generic_stateid(struct nfs4_stateid *stp) +static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - list_del(&stp->st_hash); + list_del(&stp->st_stid.sc_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } -static void close_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_ol_stateid *stp) { int i; @@ -426,13 +426,13 @@ static void close_generic_stateid(struct nfs4_stateid *stp) stp->st_file = NULL; } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void free_generic_stateid(struct nfs4_ol_stateid *stp) { close_generic_stateid(stp); kmem_cache_free(stateid_slab, stp); } -static void release_lock_stateid(struct nfs4_stateid *stp) +static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct file *file; @@ -445,14 +445,14 @@ static void release_lock_stateid(struct nfs4_stateid *stp) static void unhash_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; list_del(&lo->lo_owner.so_idhash); list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_stateid, st_perstateowner); + struct nfs4_ol_stateid, st_perstateowner); release_lock_stateid(stp); } } @@ -464,7 +464,7 @@ static void release_lockowner(struct nfs4_lockowner *lo) } static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) +release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) { struct nfs4_lockowner *lo; @@ -475,7 +475,7 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp) } } -static void release_open_stateid(struct nfs4_stateid *stp) +static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); @@ -484,14 +484,14 @@ static void release_open_stateid(struct nfs4_stateid *stp) static void unhash_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; 
list_del(&oo->oo_owner.so_idhash); list_del(&oo->oo_owner.so_strhash); list_del(&oo->oo_perclient); while (!list_empty(&oo->oo_owner.so_stateids)) { stp = list_first_entry(&oo->oo_owner.so_stateids, - struct nfs4_stateid, st_perstateowner); + struct nfs4_ol_stateid, st_perstateowner); release_open_stateid(stp); } } @@ -1068,26 +1068,26 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_stateid *find_stateid(stateid_t *t) +static struct nfs4_ol_stateid *find_stateid(stateid_t *t) { - struct nfs4_stateid *s; + struct nfs4_stid *s; unsigned int hashval; hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); - list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) - if (same_stateid(&s->st_stateid, t)) - return s; + list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) + if (same_stateid(&s->sc_stateid, t)) + return openlockstateid(s); return NULL; } -static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, char typemask) +static struct nfs4_ol_stateid *find_stateid_by_type(stateid_t *t, char typemask) { - struct nfs4_stateid *s; + struct nfs4_ol_stateid *s; s = find_stateid(t); if (!s) return NULL; - if (typemask & s->st_type) + if (typemask & s->st_stid.sc_type) return s; return NULL; } @@ -2232,7 +2232,7 @@ nfsd4_init_slabs(void) if (file_slab == NULL) goto out_nomem; stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_stateid), 0, 0, NULL); + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) goto out_nomem; deleg_slab = kmem_cache_create("nfsd4_delegations", @@ -2314,23 +2314,23 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str } static inline void -init_open_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_lockowners); - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = NFS4_OPEN_STID; stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = oo->oo_owner.so_id; - stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_boot = boot_time; + stp->st_stid.sc_stateid.si_stateownerid = oo->oo_owner.so_id; + stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ - stp->st_stateid.si_generation = 0; + stp->st_stid.sc_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, @@ -2422,7 +2422,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; dprintk("NFSD: nfs4_share_conflict\n"); @@ -2611,9 +2611,9 @@ out: } static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *local; + 
struct nfs4_ol_stateid *local; struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { @@ -2630,7 +2630,7 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_state return nfs_ok; } -static inline struct nfs4_stateid * +static inline struct nfs4_ol_stateid * nfs4_alloc_stateid(void) { return kmem_cache_alloc(stateid_slab, GFP_KERNEL); @@ -2672,11 +2672,11 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, } static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, +nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_ol_stateid **stpp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfsd4_open *open) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 status; stp = nfs4_alloc_stateid(); @@ -2708,7 +2708,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; bool new_access; @@ -2820,7 +2820,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); @@ -2888,7 +2888,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -2938,8 +2938,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } } - update_stateid(&stp->st_stateid); - memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) open->op_openowner->oo_confirmed = 1; @@ -2953,7 +2953,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; dprintk("%s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(&stp->st_stateid)); + STATEID_VAL(&stp->st_stid.sc_stateid)); out: if (fp) put_nfs4_file(fp); @@ -3122,7 +3122,7 @@ static struct nfs4_openowner * search_close_lru(u32 st_id) } static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) +nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; } @@ -3153,7 +3153,7 @@ access_permit_write(unsigned long access_bmap) } static -__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; @@ -3237,7 +3237,7 @@ static int is_delegation_stateid(stateid_t *stateid) __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = 
NULL; __be32 status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) @@ -3252,7 +3252,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) && !openowner(stp->st_stateowner)->oo_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, has_session); + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, has_session); if (status) goto out; @@ -3268,7 +3268,7 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; @@ -3317,7 +3317,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (stp->st_stateowner->so_is_open_owner && !openowner(stp->st_stateowner)->oo_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3347,7 +3347,7 @@ nfsd4_free_delegation_stateid(stateid_t *stateid) } static __be32 -nfsd4_free_lock_stateid(struct nfs4_stateid *stp) +nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) return nfserr_locks_held; @@ -3374,7 +3374,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) { stateid_t *stateid = &free_stateid->fr_stateid; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; nfs4_lock_state(); @@ -3388,11 +3388,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_bad_stateid; goto out; } - ret = check_stateid_generation(stateid, &stp->st_stateid, 1); + ret = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, 1); if (ret) goto out; - if (stp->st_type == NFS4_OPEN_STID) { + if (stp->st_stid.sc_type == NFS4_OPEN_STID) { ret = nfserr_locks_held; goto out; } else { @@ -3421,7 +3421,7 @@ static __be32 nfs4_nospecial_stateid_checks(stateid_t *stateid) return nfs_ok; } -static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_stateid *stp) +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) { struct svc_fh *current_fh = &cstate->current_fh; struct nfs4_stateowner *sop = stp->st_stateowner; @@ -3432,7 +3432,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - return check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); + return check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); } /* @@ -3441,7 +3441,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, char typemask, - struct nfs4_stateid **stpp) + struct nfs4_ol_stateid **stpp) { __be32 status; @@ -3461,7 +3461,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } -static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct 
nfs4_stateid **stpp) +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) { __be32 status; struct nfs4_openowner *oo; @@ -3482,7 +3482,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfs4_openowner *oo; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3504,10 +3504,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (oo->oo_confirmed) goto out; oo->oo_confirmed = 1; - update_stateid(&stp->st_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", - __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); nfsd4_create_clid_dir(oo->oo_owner.so_client); status = nfs_ok; @@ -3517,7 +3517,7 @@ out: return status; } -static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) +static inline void nfs4_file_downgrade(struct nfs4_ol_stateid *stp, unsigned int to_access) { int i; @@ -3545,7 +3545,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_open_downgrade *od) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3575,8 +3575,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); - update_stateid(&stp->st_stateid); - memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: if (!cstate->replay_owner) @@ -3593,7 +3593,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfs4_openowner *oo; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3622,8 +3622,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; oo = openowner(stp->st_stateowner); status = nfs_ok; - update_stateid(&stp->st_stateid); - memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); /* release_stateid() calls nfsd_close() if needed */ release_open_stateid(stp); @@ -3828,7 +3828,7 @@ find_lockowner_str(struct inode *inode, clientid_t *clid, return NULL; } -static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { unsigned int idhashval; @@ -3847,7 +3847,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s */ static struct nfs4_lockowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client 
*clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { struct nfs4_lockowner *lo; lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); @@ -3862,27 +3862,27 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return lo; } -static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +static struct nfs4_ol_stateid * +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; unsigned int hashval = stateid_hashval(lo->lo_owner.so_id, fp->fi_id); stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; - stp->st_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = lo->lo_owner.so_id; - stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_boot = boot_time; + stp->st_stid.sc_stateid.si_stateownerid = lo->lo_owner.so_id; + stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ - stp->st_stateid.si_generation = 0; + stp->st_stid.sc_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; @@ -3898,7 +3898,7 @@ check_lock_length(u64 offset, u64 length) LOFF_OVERFLOW(offset, length))); } -static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_file; int oflag = nfs4_access_to_omode(access); @@ -3918,7 +3918,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_openowner *open_sop = NULL; struct nfs4_lockowner *lock_sop = NULL; - struct nfs4_stateid *lock_stp; + struct nfs4_ol_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; struct file_lock file_lock; @@ -3949,7 +3949,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Use open owner and open stateid to create lock owner and * lock stateid. */ - struct nfs4_stateid *open_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -4052,8 +4052,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + update_stateid(&lock_stp->st_stid.sc_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, sizeof(stateid_t)); status = 0; break; @@ -4172,7 +4172,7 @@ __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct file *filp = NULL; struct file_lock file_lock; __be32 status; @@ -4220,8 +4220,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* * OK, unlock succeeded; the only thing left to do is update the stateid. 
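
update_stateid() is assumed here to bump si_generation, so a client that later presents a stale copy fails the generation check (nfserr_old_stateid or nfserr_bad_stateid in the kernel; the NFSv4.1 seqid==0 "match anything" case is omitted). A compact userspace sketch of that discipline:

#include <stdio.h>

typedef struct { unsigned int si_generation; } stateid_t;

static void update_stateid(stateid_t *s)
{
	s->si_generation++;	/* client must present the new value */
}

/* 0 = ok, -1 = old generation, -2 = generation from the future */
static int check_stateid_generation(const stateid_t *in, const stateid_t *cur)
{
	if (in->si_generation == cur->si_generation)
		return 0;
	return in->si_generation < cur->si_generation ? -1 : -2;
}

int main(void)
{
	stateid_t server = { .si_generation = 1 };
	stateid_t client = server;	/* what the client last saw */

	update_stateid(&server);	/* e.g. after this successful unlock */
	printf("stale check: %d\n", check_stateid_generation(&client, &server));
	return 0;
}
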
*/ - update_stateid(&stp->st_stateid); - memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: nfs4_unlock_state(); @@ -4264,7 +4264,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; struct nfs4_lockowner *lo; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; int i; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ef949eb3a86e..d7fffabb8d56 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -328,10 +328,10 @@ struct nfs4_replay { * for lock_owner * so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client * struct is reaped. -* so_perfilestate: heads the list of nfs4_stateid (either open or lock) -* and is used to ensure no dangling nfs4_stateid references when we +* so_perfilestate: heads the list of nfs4_ol_stateid (either open or lock) +* and is used to ensure no dangling nfs4_ol_stateid references when we * release a stateowner. -* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* so_perlockowner: (open) nfs4_ol_stateid->st_perlockowner entry - used when * close is called to reap associated byte-range locks * so_close_lru: (open) stateowner is placed on this list instead of being * reaped (when so_perfilestate is empty) to hold the last close replay. @@ -430,9 +430,9 @@ static inline struct file *find_any_file(struct nfs4_file *f) } /* -* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* nfs4_ol_stateid can either be an open stateid or (eventually) a lock stateid * -* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* (open)nfs4_ol_stateid: one per (open)nfs4_stateowner, nfs4_file * * st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry * st_perfile: file_hashtbl[] entry. @@ -446,22 +446,31 @@ static inline struct file *find_any_file(struct nfs4_file *f) * we should consider defining separate structs for the two cases. */ -struct nfs4_stateid { +struct nfs4_stid { #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 - char st_type; - struct list_head st_hash; + char sc_type; + struct list_head sc_hash; + stateid_t sc_stateid; +}; + +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; - stateid_t st_stateid; unsigned long st_access_bmap; unsigned long st_deny_bmap; - struct nfs4_stateid * st_openstp; + struct nfs4_ol_stateid * st_openstp; }; +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + /* flags for preprocess_seqid_op() */ #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -- cgit v1.2.3-58-ga151 From d5477a8db8134c481ad7b4b745f6defa119253e1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 8 Sep 2011 12:07:44 -0400 Subject: nfsd4: add common dl_stid field to delegation Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 2 +- fs/nfsd/nfs4state.c | 20 ++++++++++---------- fs/nfsd/state.h | 20 +++++++++++--------- 3 files changed, 22 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 02eb4edf0ece..93b5e405ad38 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -351,7 +351,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, __be32 *p; encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); - encode_stateid4(xdr, &dp->dl_stateid); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d0bb5a5613a9..3e3d605d3e73 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -247,10 +247,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stateid.si_boot = boot_time; - dp->dl_stateid.si_stateownerid = current_delegid++; - dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 1; + dp->dl_stid.sc_stateid.si_boot = boot_time; + dp->dl_stid.sc_stateid.si_stateownerid = current_delegid++; + dp->dl_stid.sc_stateid.si_fileid = 0; + dp->dl_stid.sc_stateid.si_generation = 1; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -2572,7 +2572,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) spin_lock(&recall_lock); list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + if (dp->dl_stid.sc_stateid.si_stateownerid == stid->si_stateownerid) { spin_unlock(&recall_lock); return dp; } @@ -2861,10 +2861,10 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ if (status) goto out_free; - memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", - STATEID_VAL(&dp->dl_stateid)); + STATEID_VAL(&dp->dl_stid.sc_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -3296,7 +3296,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(ino, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; status = nfs4_check_delegmode(dp, flags); @@ -3667,7 +3667,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; renew_client(dp->dl_client); @@ -3737,7 +3737,7 @@ search_for_delegation(stateid_t *stid) list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { list_for_each(pos, &fp->fi_delegations) { dp = list_entry(pos, struct nfs4_delegation, dl_perfile); - if (same_stateid(&dp->dl_stateid, stid)) + if (same_stateid(&dp->dl_stid.sc_stateid, stid)) return dp; } } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d7fffabb8d56..e3ff7c9f9264 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -76,6 +76,15 @@ struct 
nfsd4_callback { bool cb_done; }; +struct nfs4_stid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 + char sc_type; + struct list_head sc_hash; + stateid_t sc_stateid; +}; + struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; @@ -86,7 +95,7 @@ struct nfs4_delegation { u32 dl_type; time_t dl_time; /* For recall: */ - stateid_t dl_stateid; + struct nfs4_stid dl_stid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -446,14 +455,7 @@ static inline struct file *find_any_file(struct nfs4_file *f) * we should consider defining separate structs for the two cases. */ -struct nfs4_stid { -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 - char sc_type; - struct list_head sc_hash; - stateid_t sc_stateid; -}; - +/* "ol" stands for "Open or Lock". Better suggestions welcome. */ struct nfs4_ol_stateid { struct nfs4_stid st_stid; struct list_head st_perfile; -- cgit v1.2.3-58-ga151 From 36d44c6038f6d3f8786187a5558c2f3b39a7af95 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 8 Sep 2011 12:16:03 -0400 Subject: nfsd4: share common stid-hashing helper function Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e3d605d3e73..581a5d01b005 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -218,6 +218,15 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } +static inline void hash_stid(struct nfs4_stid *stid) +{ + stateid_t *s = &stid->sc_stateid; + unsigned int hashval; + + hashval = stateid_hashval(s->si_stateownerid, s->si_fileid); + list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -2316,10 +2325,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str static inline void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; - unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_lockowners); - list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); stp->st_stid.sc_type = NFS4_OPEN_STID; @@ -2331,6 +2338,7 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; + hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = 0; __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, @@ -3866,12 +3874,10 @@ static struct nfs4_ol_stateid * alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { struct nfs4_ol_stateid *stp; - unsigned int hashval = stateid_hashval(lo->lo_owner.so_id, fp->fi_id); stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; - list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; @@ -3883,6 +3889,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file 
*fp, struct stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; + hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; -- cgit v1.2.3-58-ga151 From f459e4535904e16ca9f0cc202c78345c332bbbad Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Sep 2011 09:06:12 -0400 Subject: nfsd4: hash deleg stateid's like any other It's simpler to look up delegation stateid's in the same hash table as any other stateid. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 112 +++++++++++++++++++++------------------------------- fs/nfsd/state.h | 5 +++ 2 files changed, 51 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 581a5d01b005..24685a02690f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -60,8 +60,6 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_delegation * search_for_delegation(stateid_t *stid); -static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -256,10 +254,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; + dp->dl_stid.sc_type = NFS4_DELEG_STID; dp->dl_stid.sc_stateid.si_boot = boot_time; dp->dl_stid.sc_stateid.si_stateownerid = current_delegid++; dp->dl_stid.sc_stateid.si_fileid = 0; dp->dl_stid.sc_stateid.si_generation = 1; + hash_stid(&dp->dl_stid); fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -292,6 +292,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) static void unhash_delegation(struct nfs4_delegation *dp) { + list_del_init(&dp->dl_stid.sc_hash); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -1077,7 +1078,7 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_ol_stateid *find_stateid(stateid_t *t) +static struct nfs4_stid *find_stateid(stateid_t *t) { struct nfs4_stid *s; unsigned int hashval; @@ -1085,22 +1086,42 @@ static struct nfs4_ol_stateid *find_stateid(stateid_t *t) hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) if (same_stateid(&s->sc_stateid, t)) - return openlockstateid(s); + return s; return NULL; } -static struct nfs4_ol_stateid *find_stateid_by_type(stateid_t *t, char typemask) +static struct nfs4_ol_stateid *find_ol_stateid(stateid_t *t) { - struct nfs4_ol_stateid *s; + struct nfs4_stid *s; + + s = find_stateid(t); + if (!s) + return NULL; + return openlockstateid(s); +} + +static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) +{ + struct nfs4_stid *s; s = find_stateid(t); if (!s) return NULL; - if (typemask & s->st_stid.sc_type) + if (typemask & s->sc_type) return s; return NULL; } +static struct nfs4_ol_stateid *find_ol_stateid_by_type(stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + s = find_stateid_by_type(t, typemask); + if (!s) + return NULL; + return openlockstateid(s); +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, 
nfs4_verifier *verf) { @@ -2573,26 +2594,21 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) return nfs_ok; } -static struct nfs4_delegation * -find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +static int share_access_to_flags(u32 share_access) { - struct nfs4_delegation *dp; + share_access &= ~NFS4_SHARE_WANT_MASK; - spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stid.sc_stateid.si_stateownerid == stid->si_stateownerid) { - spin_unlock(&recall_lock); - return dp; - } - spin_unlock(&recall_lock); - return NULL; + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static int share_access_to_flags(u32 share_access) +static struct nfs4_delegation *find_deleg_stateid(stateid_t *s) { - share_access &= ~NFS4_SHARE_WANT_MASK; + struct nfs4_stid *ret; - return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; + ret = find_stateid_by_type(s, NFS4_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); } static __be32 @@ -2602,7 +2618,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, int flags; __be32 status = nfserr_bad_stateid; - *dp = find_delegation_file(fp, &open->op_delegate_stateid); + *dp = find_deleg_stateid(&open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -3252,7 +3268,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_expired; - stp = find_stateid(stateid); + stp = find_ol_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3301,7 +3317,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, */ status = nfserr_expired; if (is_delegation_stateid(stateid)) { - dp = find_delegation_stateid(ino, stateid); + dp = find_deleg_stateid(stateid); if (!dp) goto out; status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); @@ -3316,7 +3332,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, BUG_ON(!*filpp); } } else { /* open or lock stateid */ - stp = find_stateid(stateid); + stp = find_ol_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3348,9 +3364,10 @@ out: static __be32 nfsd4_free_delegation_stateid(stateid_t *stateid) { - struct nfs4_delegation *dp = search_for_delegation(stateid); + struct nfs4_delegation *dp = find_deleg_stateid(stateid); if (dp) return nfserr_locks_held; + return nfserr_bad_stateid; } @@ -3391,7 +3408,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - stp = find_stateid(stateid); + stp = find_ol_stateid(stateid); if (!stp) { ret = nfserr_bad_stateid; goto out; @@ -3460,7 +3477,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, status = nfs4_nospecial_stateid_checks(stateid); if (status) return status; - *stpp = find_stateid_by_type(stateid, typemask); + *stpp = find_ol_stateid_by_type(stateid, typemask); if (*stpp == NULL) return nfserr_expired; cstate->replay_owner = (*stpp)->st_stateowner; @@ -3672,7 +3689,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!is_delegation_stateid(stateid)) goto out; status = nfserr_expired; - dp = find_delegation_stateid(inode, stateid); + dp = find_deleg_stateid(stateid); if (!dp) goto out; status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); @@ -3733,43 +3750,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, static struct 
list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct nfs4_delegation * -search_for_delegation(stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dp; - struct list_head *pos; - int i; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { - list_for_each(pos, &fp->fi_delegations) { - dp = list_entry(pos, struct nfs4_delegation, dl_perfile); - if (same_stateid(&dp->dl_stid.sc_stateid, stid)) - return dp; - } - } - } - return NULL; -} - -static struct nfs4_delegation * -find_delegation_stateid(struct inode *ino, stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dl; - - dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(stid)); - - fp = find_file(ino); - if (!fp) - return NULL; - dl = find_delegation_file(fp, stid); - put_nfs4_file(fp); - return dl; -} - /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e3ff7c9f9264..12c142436705 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -113,6 +113,11 @@ struct nfs4_cb_conn { struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ -- cgit v1.2.3-58-ga151 From 97b7e3b6d4ae838694b43f66b4f0b32b5ec7635b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Sep 2011 11:26:58 -0400 Subject: nfsd4: fix test_stateid for delegation stateid's Test_stateid should handle delegation stateid's as well. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 24685a02690f..30387f3d9881 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3261,28 +3261,26 @@ static int is_delegation_stateid(stateid_t *stateid) __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { - struct nfs4_ol_stateid *stp = NULL; - __be32 status = nfserr_stale_stateid; + struct nfs4_stid *s; + struct nfs4_ol_stateid *ols; + __be32 status; if (STALE_STATEID(stateid)) - goto out; - - status = nfserr_expired; - stp = find_ol_stateid(stateid); - if (!stp) - goto out; - status = nfserr_bad_stateid; - if (stp->st_stateowner->so_is_open_owner - && !openowner(stp->st_stateowner)->oo_confirmed) - goto out; + return nfserr_stale_stateid; - status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, has_session); + s = find_stateid(stateid); + if (!s) + return nfserr_stale_stateid; + status = check_stateid_generation(stateid, &s->sc_stateid, has_session); if (status) - goto out; - - status = nfs_ok; -out: - return status; + return status; + if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + return nfs_ok; + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !openowner(ols->st_stateowner)->oo_confirmed) + return nfserr_bad_stateid; + return nfs_ok; } /* -- cgit v1.2.3-58-ga151 From 69064a2764fe195f1478be3ea83d15abe5d71025 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 9 Sep 2011 11:54:57 -0400 Subject: nfsd4: use deleg changes to cleanup preprocess_stateid_op Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 30387f3d9881..ea338d0c62a1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3290,6 +3290,7 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { + struct nfs4_stid *s; struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; @@ -3314,13 +3315,14 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, * but that we can't find, is expired: */ status = nfserr_expired; - if (is_delegation_stateid(stateid)) { - dp = find_deleg_stateid(stateid); - if (!dp) - goto out; - status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); - if (status) - goto out; + s = find_stateid(stateid); + if (!s) + goto out; + status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + if (status) + goto out; + if (s->sc_type == NFS4_DELEG_STID) { + dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) goto out; @@ -3330,19 +3332,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, BUG_ON(!*filpp); } } else { /* open or lock stateid */ - stp = find_ol_stateid(stateid); - if (!stp) - goto out; + stp = openlockstateid(s); status = nfserr_bad_stateid; if (nfs4_check_fh(current_fh, stp)) goto out; if (stp->st_stateowner->so_is_open_owner && !openowner(stp->st_stateowner)->oo_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, - nfsd4_has_session(cstate)); - if (status) - goto out; status = nfs4_check_openmode(stp, flags); if (status) goto out; -- cgit v1.2.3-58-ga151 From ee626a77d3725a129391b1c85edd46f3b470cca9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 11 Sep 2011 13:48:41 -0400 Subject: nfsd4: better stateid hashing First, we shouldn't care here about the structure of the opaque part of the stateid. Second, this hash is really dumb. (I'm not sure the replacement is much better, though--to look at it another patch.) Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ea338d0c62a1..0cd346477f29 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -168,9 +168,9 @@ static unsigned int file_hashval(struct inode *ino) return hash_ptr(ino, FILE_HASH_BITS); } -static unsigned int stateid_hashval(u32 owner_id, u32 file_id) +static unsigned int stateid_hashval(stateid_t *s) { - return (owner_id + file_id) & STATEID_HASH_MASK; + return opaque_hashval(&s->si_opaque, sizeof(stateid_opaque_t)) & STATEID_HASH_MASK; } static struct list_head file_hashtbl[FILE_HASH_SIZE]; @@ -221,7 +221,7 @@ static inline void hash_stid(struct nfs4_stid *stid) stateid_t *s = &stid->sc_stateid; unsigned int hashval; - hashval = stateid_hashval(s->si_stateownerid, s->si_fileid); + hashval = stateid_hashval(s); list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); } @@ -1083,7 +1083,7 @@ static struct nfs4_stid *find_stateid(stateid_t *t) struct nfs4_stid *s; unsigned int hashval; - hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); + hashval = stateid_hashval(t); list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) if (same_stateid(&s->sc_stateid, t)) return s; -- cgit v1.2.3-58-ga151 From ed748aacb8e3318fa2cf24e1c197d35b5fd29605 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Sep 2011 19:37:06 -0400 Subject: NFSD: Cleanup for nfsd4_path() The current code is sort of hackish in that it assumes a referral is always matched to an export. When we add support for junctions that may not be the case. We can replace nfsd4_path() with a function that encodes the components directly from the dentries. Since nfsd4_path is currently the only user of the 'ex_pathname' field in struct svc_export, this has the added benefit of allowing us to get rid of that. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 4 +- fs/nfsd/nfs4xdr.c | 106 ++++++++++++++++++++++++++++++++------------ include/linux/nfsd/export.h | 1 + 3 files changed, 81 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index d491421cd708..99229b0c153e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1009,7 +1009,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } -static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; @@ -1029,7 +1029,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) struct svc_export *exp; __be32 rv; - exp = find_fsidzero_export(rqstp); + exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 182570bed472..5252d6681960 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1696,36 +1696,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, } /* - * Return the path to an export point in the pseudo filesystem namespace - * Returned string is safe to use as long as the caller holds a reference - * to @exp. 
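
The replacement below drops the string-prefix comparison entirely: it walks from the target dentry up to the pseudoroot collecting components, then emits a count followed by the components root-first, as RFC 3530's pathname4 expects. A userspace sketch of that shape, with plain strings standing in for dentries (hypothetical helper, illustrative only):

#include <stdio.h>
#include <string.h>

static void encode_components(const char *root, const char *path)
{
	const char *components[16];
	unsigned int n = 0;
	char buf[256];
	char *p;

	snprintf(buf, sizeof(buf), "%s", path);
	/* walk "up": strip one trailing component per iteration */
	while (n < 16 && strcmp(buf, root) != 0 &&
	       (p = strrchr(buf, '/')) != NULL) {
		components[n++] = p + 1;	/* substring stays valid in buf */
		*p = '\0';
	}

	/* pathname4 layout: component count, then components root-first */
	printf("%u:", n);
	while (n)
		printf(" %s", components[--n]);
	printf("\n");
}

int main(void)
{
	encode_components("/export", "/export/a/b/c");	/* prints "3: a b c" */
	return 0;
}
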
+ * Encode a path in RFC3530 'pathname4' format */ -static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +static __be32 nfsd4_encode_path(const struct path *root, + const struct path *path, __be32 **pp, int *buflen) { - struct svc_fh tmp_fh; - char *path = NULL, *rootpath; - size_t rootlen; + struct path cur = { + .mnt = path->mnt, + .dentry = path->dentry, + }; + __be32 *p = *pp; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; - fh_init(&tmp_fh, NFS4_FHSIZE); - *stat = exp_pseudoroot(rqstp, &tmp_fh); - if (*stat) - return NULL; - rootpath = tmp_fh.fh_export->ex_pathname; + dprintk("nfsd4_encode_components("); - path = exp->ex_pathname; + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (cur.dentry == root->dentry && cur.mnt == root->mnt) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } - rootlen = strlen(rootpath); - if (strncmp(path, rootpath, rootlen)) { - dprintk("nfsd: fs_locations failed;" - "%s is not contained in %s\n", path, rootpath); - *stat = nfserr_notsupp; - path = NULL; - goto out; + *buflen -= 4; + if (*buflen < 0) + goto out_free; + WRITE32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len = dentry->d_name.len; + + *buflen -= 4 + (XDR_QUADLEN(len) << 2); + if (*buflen < 0) + goto out_free; + WRITE32(len); + WRITEMEM(dentry->d_name.name, len); + dprintk("/%s", dentry->d_name.name); + dput(dentry); + ncomponents--; } - path += rootlen; -out: - fh_put(&tmp_fh); - return path; + + *pp = p; + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, + const struct path *path, __be32 **pp, int *buflen) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + exp_put(exp_ps); + return res; } /* @@ -1739,11 +1792,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, int i; __be32 *p = *pp; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - char *root = nfsd4_path(rqstp, exp, &status); - if (status) - return status; - status = nfsd4_encode_components('/', root, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); if (status) return status; if ((*buflen -= 4) < 0) diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 8a31a20efe7e..7ba3fd43f312 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -137,6 +137,7 @@ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -- cgit v1.2.3-58-ga151 From 
2f1ddda1749a223d1a05e16dc6ea28632b9ec570 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Sep 2011 19:37:16 -0400 Subject: NFSD: Remove the ex_pathname field from struct svc_export There are no more users... Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 11 ----------- include/linux/nfsd/export.h | 1 - 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 99229b0c153e..62f3b9074e84 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -317,7 +317,6 @@ static void svc_export_put(struct kref *ref) struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); - kfree(exp->ex_pathname); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp); } @@ -527,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; - err = -ENOMEM; - exp.ex_pathname = kstrdup(buf, GFP_KERNEL); - if (!exp.ex_pathname) - goto out2; - /* expiry */ err = -EINVAL; exp.h.expiry_time = get_expiry(&mesg); @@ -612,8 +606,6 @@ out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: - kfree(exp.ex_pathname); -out2: path_put(&exp.ex_path); out1: auth_domain_put(dom); @@ -677,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_client = item->ex_client; new->ex_path.dentry = dget(item->ex_path.dentry); new->ex_path.mnt = mntget(item->ex_path.mnt); - new->ex_pathname = NULL; new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -695,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_fsid = item->ex_fsid; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; - new->ex_pathname = item->ex_pathname; - item->ex_pathname = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 7ba3fd43f312..f85308e688fd 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -96,7 +96,6 @@ struct svc_export { struct auth_domain * ex_client; int ex_flags; struct path ex_path; - char *ex_pathname; uid_t ex_anon_uid; gid_t ex_anon_gid; int ex_fsid; -- cgit v1.2.3-58-ga151 From 11fcee0293a6d9f0973e04f8b3fb6cd15a55bcce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Sep 2011 19:37:26 -0400 Subject: NFSD: Add a cache for fs_locations information Signed-off-by: Trond Myklebust [ cel: since this is server-side, use nfsd4_ prefix instead of nfs4_ prefix. ] [ cel: implement S_ISVTX filter in bfields-normal form ] Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsd.h | 7 +++++++ fs/nfsd/vfs.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8da03e16ab35..58134a23fdfb 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -361,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ NFSD_WRITEABLE_ATTRS_WORD2 +extern int nfsd4_is_junction(struct dentry *dentry); +#else +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 75c35fa46155..4dd91283d039 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { if (d_mountpoint(dentry)) return 1; + if (nfsd4_is_junction(dentry)) + return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; return dentry->d_inode != NULL; @@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } +#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." +#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + return 0; + return 1; +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 -- cgit v1.2.3-58-ga151 From 849a1cf13d4394d398d91752166e92e9ecd64f8d Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Tue, 30 Aug 2011 17:18:41 +0800 Subject: SUNRPC: Replace svc_addr_u by sockaddr_storage For IPv6 local address, lockd can not callback to client for missing scope id when binding address at inet6_bind: 324 if (addr_type & IPV6_ADDR_LINKLOCAL) { 325 if (addr_len >= sizeof(struct sockaddr_in6) && 326 addr->sin6_scope_id) { 327 /* Override any existing binding, if another one 328 * is supplied by user. 329 */ 330 sk->sk_bound_dev_if = addr->sin6_scope_id; 331 } 332 333 /* Binding to link-local address requires an interface */ 334 if (!sk->sk_bound_dev_if) { 335 err = -EINVAL; 336 goto out_unlock; 337 } Replacing svc_addr_u by sockaddr_storage, let rqstp->rq_daddr contains more info besides address. Reviewed-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Mi Jinlong Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 25 ++----------------------- fs/nfsd/nfs4state.c | 16 +--------------- include/linux/sunrpc/svc.h | 30 +++++++++++++++++++++--------- net/sunrpc/svc_xprt.c | 13 ++----------- net/sunrpc/svcsock.c | 23 +++++++++++++++++------ 5 files changed, 43 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b7c99bfb3da6..6f29836ec0cb 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - }; - struct sockaddr *src_sap; - size_t src_len = rqstp->rq_addrlen; + struct sockaddr *src_sap = svc_daddr(rqstp); + size_t src_len = rqstp->rq_daddrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - switch (ni.sap->sa_family) { - case AF_INET: - sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - src_sap = (struct sockaddr *)&sin; - break; - case AF_INET6: - ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - src_sap = (struct sockaddr *)&sin6; - break; - default: - dprintk("lockd: %s failed; unrecognized address family\n", - __func__); - goto out; - } - if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0cd346477f29..e7f83bd9b4a8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1257,20 +1257,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) return NULL; } -static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) -{ - switch (family) { - case AF_INET: - ((struct sockaddr_in *)sa)->sin_family = AF_INET; - ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; - return; - case AF_INET6: - ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; - ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; - return; - } -} - static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { @@ -1302,7 +1288,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; - rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a78a51e93373..d8d5d93071b3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -212,11 +212,6 @@ static inline void svc_putu32(struct kvec *iov, __be32 val) iov->iov_len += sizeof(__be32); } -union svc_addr_u { - struct in_addr addr; - struct in6_addr addr6; -}; - /* * The context of a single thread, including the request currently being * processed. 
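
The bind() failure quoted in the changelog comes down to losing sin6_scope_id: union svc_addr_u stored only the bare address bytes, whereas copying the whole sockaddr keeps family and scope id intact. A minimal userspace illustration (standard BSD socket types; not kernel code):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	struct sockaddr_in6 local = {
		.sin6_family	= AF_INET6,
		.sin6_scope_id	= 2,	/* interface index; required for fe80::/10 */
	};
	struct sockaddr_storage full;
	struct in6_addr addr_only;

	/* old scheme: only the 16 address bytes survive the copy */
	memcpy(&addr_only, &local.sin6_addr, sizeof(addr_only));

	/* new scheme: family, port and scope id survive too */
	memcpy(&full, &local, sizeof(local));

	printf("scope id after full copy: %u\n",
	       ((struct sockaddr_in6 *)&full)->sin6_scope_id);
	return 0;
}
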
@@ -225,8 +220,12 @@ struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ struct svc_xprt * rq_xprt; /* transport ptr */ + struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; + struct sockaddr_storage rq_daddr; /* dest addr of request + * - reply from here */ + size_t rq_daddrlen; struct svc_serv * rq_server; /* RPC service definition */ struct svc_pool * rq_pool; /* thread pool */ @@ -255,9 +254,6 @@ struct svc_rqst { unsigned short rq_secure : 1; /* secure port */ - union svc_addr_u rq_daddr; /* dest addr of request - * - reply from here */ - void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ @@ -300,6 +296,21 @@ static inline struct sockaddr *svc_addr(const struct svc_rqst *rqst) return (struct sockaddr *) &rqst->rq_addr; } +static inline struct sockaddr_in *svc_daddr_in(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in *) &rqst->rq_daddr; +} + +static inline struct sockaddr_in6 *svc_daddr_in6(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in6 *) &rqst->rq_daddr; +} + +static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst) +{ + return (struct sockaddr *) &rqst->rq_daddr; +} + /* * Check buffer bounds after decoding arguments */ @@ -340,7 +351,8 @@ struct svc_deferred_req { struct svc_xprt *xprt; struct sockaddr_storage addr; /* where reply must go */ size_t addrlen; - union svc_addr_u daddr; /* where reply must come from */ + struct sockaddr_storage daddr; /* where reply must come from */ + size_t daddrlen; struct cache_deferred_req handle; size_t xprt_hlen; int argslen; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index bd31208bbb61..d86bb673e1f6 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -254,8 +254,6 @@ EXPORT_SYMBOL_GPL(svc_create_xprt); */ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) { - struct sockaddr *sin; - memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); rqstp->rq_addrlen = xprt->xpt_remotelen; @@ -263,15 +261,8 @@ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) * Destination address in request is needed for binding the * source address in RPC replies/callbacks later. 
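
The svc_daddr*() helpers added above are thin typed views over the same sockaddr_storage; a sketch of that accessor pattern (hypothetical struct, illustrative only):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

struct rqst {
	struct sockaddr_storage rq_daddr;	/* generic storage */
};

static struct sockaddr *daddr(struct rqst *r)
{
	return (struct sockaddr *)&r->rq_daddr;
}

static struct sockaddr_in6 *daddr_in6(struct rqst *r)
{
	return (struct sockaddr_in6 *)&r->rq_daddr;
}

int main(void)
{
	struct rqst r;

	memset(&r, 0, sizeof(r));
	daddr_in6(&r)->sin6_family = AF_INET6;		/* fill the IPv6 view */
	printf("family = %d\n", daddr(&r)->sa_family);	/* read the generic view */
	return 0;
}
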
*/ - sin = (struct sockaddr *)&xprt->xpt_local; - switch (sin->sa_family) { - case AF_INET: - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; - break; - case AF_INET6: - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; - break; - } + memcpy(&rqstp->rq_daddr, &xprt->xpt_local, xprt->xpt_locallen); + rqstp->rq_daddrlen = xprt->xpt_locallen; } EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 767d494de7a2..dfd686eb0b7f 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -143,19 +143,20 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) cmh->cmsg_level = SOL_IP; cmh->cmsg_type = IP_PKTINFO; pki->ipi_ifindex = 0; - pki->ipi_spec_dst.s_addr = rqstp->rq_daddr.addr.s_addr; + pki->ipi_spec_dst.s_addr = + svc_daddr_in(rqstp)->sin_addr.s_addr; cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); } break; case AF_INET6: { struct in6_pktinfo *pki = CMSG_DATA(cmh); + struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp); cmh->cmsg_level = SOL_IPV6; cmh->cmsg_type = IPV6_PKTINFO; - pki->ipi6_ifindex = 0; - ipv6_addr_copy(&pki->ipi6_addr, - &rqstp->rq_daddr.addr6); + pki->ipi6_ifindex = daddr->sin6_scope_id; + ipv6_addr_copy(&pki->ipi6_addr, &daddr->sin6_addr); cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); } break; @@ -498,9 +499,13 @@ static int svc_udp_get_dest_address4(struct svc_rqst *rqstp, struct cmsghdr *cmh) { struct in_pktinfo *pki = CMSG_DATA(cmh); + struct sockaddr_in *daddr = svc_daddr_in(rqstp); + if (cmh->cmsg_type != IP_PKTINFO) return 0; - rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; + + daddr->sin_family = AF_INET; + daddr->sin_addr.s_addr = pki->ipi_spec_dst.s_addr; return 1; } @@ -511,9 +516,14 @@ static int svc_udp_get_dest_address6(struct svc_rqst *rqstp, struct cmsghdr *cmh) { struct in6_pktinfo *pki = CMSG_DATA(cmh); + struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp); + if (cmh->cmsg_type != IPV6_PKTINFO) return 0; - ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr); + + daddr->sin6_family = AF_INET6; + ipv6_addr_copy(&daddr->sin6_addr, &pki->ipi6_addr); + daddr->sin6_scope_id = pki->ipi6_ifindex; return 1; } @@ -614,6 +624,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram_locked(svsk->sk_sk, skb); return 0; } + rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); if (skb_is_nonlinear(skb)) { /* we have to copy */ -- cgit v1.2.3-58-ga151 From 58e7b33a58d0cd07c9294d5161553b204c75662d Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Sun, 28 Aug 2011 18:18:56 +0800 Subject: nfsd41: try to check reply size before operation For checking the size of reply before calling a operation, we need try to get maxsize of the operation's reply. v3: using new method as Bruce said, "we could handle operations in two different ways: - For operations that actually change something (write, rename, open, close, ...), do it the way we're doing it now: be very careful to estimate the size of the response before even processing the operation. - For operations that don't change anything (read, getattr, ...) just go ahead and do the operation. If you realize after the fact that the response is too large, then return the error at that point. So we'd add another flag to op_flags: say, OP_MODIFIES_SOMETHING. And for operations with OP_MODIFIES_SOMETHING set, we'd do the first thing. For operations without it set, we'd do the second." Signed-off-by: Mi Jinlong [bfields@redhat.com: crash, don't attempt to handle, undefined op_rsize_bop] Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4xdr.c | 37 ++++---- fs/nfsd/xdr4.h | 1 + 3 files changed, 248 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 460eeb329d81..752a367e0e3d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -35,6 +35,7 @@ #include #include +#include "idmap.h" #include "cache.h" #include "xdr4.h" #include "vfs.h" @@ -1003,6 +1004,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); + enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ @@ -1010,6 +1013,7 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, + OP_MODIFIES_SOMETHING = 1 << 5, /* op is non-idempotent */ }; struct nfsd4_operation { @@ -1025,6 +1029,8 @@ struct nfsd4_operation { * the v4.0 case). */ bool op_cacheresult; + /* Try to get response size before operation */ + nfsd4op_rsize op_rsize_bop; }; static struct nfsd4_operation nfsd4_ops[]; @@ -1119,6 +1125,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; + u32 plen = 0; __be32 status; resp->xbuf = &rqstp->rq_res; @@ -1197,6 +1204,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } + /* If op is non-idempotent */ + if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + plen = opdesc->op_rsize_bop(rqstp, op); + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + if (opdesc->op_func) op->status = opdesc->op_func(rqstp, cstate, &op->u); else @@ -1247,6 +1263,144 @@ out: return status; } +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return 
(op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = op->u.read.rd_length; + + if (rlen > maxcount) + rlen = maxcount; + + return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 rlen = op->u.readdir.rd_maxcount; + + if (rlen > PAGE_SIZE) + rlen = PAGE_SIZE; + + return (op_encode_hdr_size + op_encode_verifier_maxsz) + * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1254,20 +1408,28 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLOSE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_COMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE", .op_cacheresult = true, + .op_rsize_bop = 
(nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, @@ -1280,12 +1442,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING, .op_name = "OP_LINK", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, @@ -1293,7 +1459,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCKU", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, @@ -1311,42 +1479,54 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTPUBFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTROOTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, @@ -1354,29 +1534,38 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_REMOVE", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { - .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_RENAME", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, - .op_flags = ALLOWED_WITHOUT_FH | 
ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RENEW", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_RESTOREFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_SAVEFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, @@ -1386,19 +1575,25 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", + .op_flags = OP_MODIFIES_SOMETHING, .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_SETCLIENTID", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_SETCLIENTID_CONFIRM", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, @@ -1406,35 +1601,47 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_WRITE", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = (nfsd4op_func)nfsd4_exchange_id, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, }, [OP_BIND_CONN_TO_SESSION] = { .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, }, [OP_DESTROY_SESSION] = { .op_func = (nfsd4op_func)nfsd4_destroy_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = 
(nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEQUENCE] = { .op_func = (nfsd4op_func)nfsd4_sequence, @@ -1443,13 +1650,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_DESTROY_CLIENTID] = { .op_func = NULL, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_RECLAIM_COMPLETE] = { .op_func = (nfsd4op_func)nfsd4_reclaim_complete, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, @@ -1463,8 +1673,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_FREE_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, }; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5252d6681960..f4116cf16595 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3387,34 +3387,29 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation. + * after encoding the current operation with pad. * - * pad: add on 8 bytes for the next operation's op_code and status so that - * there is room to cache a failure on the next operation. + * pad: if the operation is non-idempotent, pad was calculated by op_rsize_bop(), + * which is specified in nfsd4_operation; otherwise pad is zero. * - * Compare this length to the session se_fmaxresp_cached. + * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. * * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so * will be at least a page and will therefore hold the xdr_buf head.
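The rewritten comment above captures the policy of this commit: an operation flagged OP_MODIFIES_SOMETHING has its worst-case reply size, taken from op_rsize_bop(), checked before it executes, while idempotent operations run first and are size-checked after encoding with pad == 0. The following is a simplified, self-contained sketch of that two-phase check; all names (op_desc, process_op, and so on) are invented for illustration, and this is not the nfsd code itself.

/*
 * Two-phase response-size policy, sketched with invented names.
 */
#include <stddef.h>
#include <stdint.h>

#define OP_MODIFIES_SOMETHING (1u << 5)

struct op;
typedef uint32_t (*rsize_fn)(const struct op *op);

struct op_desc {
        uint32_t flags;
        rsize_fn rsize;         /* worst-case encoded reply size */
};

struct op {
        const struct op_desc *desc;
};

enum { OK = 0, ERR_REP_TOO_BIG = 1 };

static int check_resp_size(size_t used, size_t pad, size_t maxresp)
{
        return (used + pad > maxresp) ? ERR_REP_TOO_BIG : OK;
}

static int process_op(const struct op *op, size_t used, size_t maxresp)
{
        /* Modifying op: refuse up front if the reply might not fit. */
        if (op->desc->flags & OP_MODIFIES_SOMETHING) {
                int err = check_resp_size(used, op->desc->rsize(op), maxresp);
                if (err)
                        return err;     /* nothing has been modified yet */
        }

        /* ... execute the operation and encode its result here ... */

        /* Idempotent ops are cheap to fail late; re-check with pad == 0. */
        return check_resp_size(used, 0, maxresp);
}

The asymmetry exists because a modifying operation that has already run cannot simply be retried by the client, so it must never execute when the server cannot respond.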
*/ -static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) { - int status = 0; struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_session *session = NULL; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0, pad = 8; + u32 length, tlen = 0; if (!nfsd4_has_session(&resp->cstate)) - return status; + return 0; session = resp->cstate.session; - if (session == NULL || slot->sl_cachethis == 0) - return status; - - if (resp->opcnt >= args->opcnt) - pad = 0; /* this is the last operation */ + if (session == NULL) + return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3427,10 +3422,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, length, xb->page_len, tlen, pad); - if (length <= session->se_fchannel.maxresp_cached) - return status; - else + if (length > session->se_fchannel.maxresp_sz) + return nfserr_rep_too_big; + + if (slot->sl_cachethis == 1 && + length > session->se_fchannel.maxresp_cached) return nfserr_rep_too_big_to_cache; + + return 0; } void @@ -3450,8 +3449,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); /* nfsd4_check_drc_limit guarantees enough room for error status */ - if (!op->status && nfsd4_check_drc_limit(resp)) - op->status = nfserr_rep_too_big_to_cache; + if (!op->status) + op->status = nfsd4_check_resp_size(resp, 0); status: /* * Note: We write the status directly, instead of using WRITE32(), diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index f95a72482064..a767b57b8208 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -524,6 +524,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); +int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, -- cgit v1.2.3-58-ga151 From dad1c067eb42ec8bedadd64f681056914547d22e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 Sep 2011 12:24:13 -0400 Subject: nfsd4: replace oo_confirmed by flag bit I want at least one more bit here. So, let's haul out the caps lock key and add a flags field. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 ++-- fs/nfsd/nfs4state.c | 24 ++++++++++++------------ fs/nfsd/state.h | 3 ++- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 752a367e0e3d..aa769dfa756a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -375,7 +375,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * The CURRENT_FH is already set to the file being * opened. 
(1) set open->op_cinfo, (2) set @@ -388,7 +388,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e7f83bd9b4a8..59b70afdc884 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2322,7 +2322,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return NULL; oo->oo_owner.so_is_open_owner = 1; oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_confirmed = 0; + oo->oo_flags = 0; oo->oo_time = 0; INIT_LIST_HEAD(&oo->oo_close_lru); hash_openowner(oo, clp, strhashval); @@ -2549,7 +2549,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_expired; goto renew; } - if (!oo->oo_confirmed) { + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ clp = oo->oo_owner.so_client; release_openowner(oo); @@ -2616,7 +2616,7 @@ out: return nfs_ok; if (status) return status; - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } @@ -2749,7 +2749,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; open->op_openowner->oo_owner.so_client->cl_firststate = 1; } @@ -2853,7 +2853,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !oo->oo_confirmed) + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2952,7 +2952,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * Attempt to hand out a delegation. No error return, because the @@ -2973,7 +2973,7 @@ out: * To finish the open response, we just need to set the rflags. 
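An aside on the conversion being applied in these hunks: trading the dedicated int oo_confirmed for a one-byte flags field costs nothing today and leaves seven spare bits (NFS4_OO_PURGE_CLOSE claims one in the very next commit). A minimal sketch of the idiom, with invented names and no reference to the real kernel structures:

/* Boolean-to-flag-bit conversion, sketched with invented names. */
#define OO_CONFIRMED    1       /* was: int confirmed */

struct openowner {
        unsigned char flags;    /* room for seven more bits */
};

static int oo_is_confirmed(const struct openowner *oo)
{
        return oo->flags & OO_CONFIRMED;
}

static void oo_confirm(struct openowner *oo)
{
        oo->flags |= OO_CONFIRMED;
}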
*/ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_openowner->oo_confirmed && + if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; @@ -3264,7 +3264,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) return nfs_ok; ols = openlockstateid(s); if (ols->st_stateowner->so_is_open_owner - && !openowner(ols->st_stateowner)->oo_confirmed) + && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) return nfserr_bad_stateid; return nfs_ok; } @@ -3323,7 +3323,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (nfs4_check_fh(current_fh, stp)) goto out; if (stp->st_stateowner->so_is_open_owner - && !openowner(stp->st_stateowner)->oo_confirmed) + && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) goto out; status = nfs4_check_openmode(stp, flags); if (status) @@ -3476,7 +3476,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs if (status) return status; oo = openowner((*stpp)->st_stateowner); - if (!oo->oo_confirmed) + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) return nfserr_bad_stateid; return nfs_ok; } @@ -3506,9 +3506,9 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; - if (oo->oo_confirmed) + if (oo->oo_flags & NFS4_OO_CONFIRMED) goto out; - oo->oo_confirmed = 1; + oo->oo_flags |= NFS4_OO_CONFIRMED; update_stateid(&stp->st_stid.sc_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 12c142436705..a8324b868a36 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -371,7 +371,8 @@ struct nfs4_openowner { struct list_head oo_perclient; struct list_head oo_close_lru; /* tail queue */ time_t oo_time; /* time of placement on so_close_lru */ - int oo_confirmed; /* successful OPEN_CONFIRM? */ +#define NFS4_OO_CONFIRMED 1 + unsigned char oo_flags; }; struct nfs4_lockowner { -- cgit v1.2.3-58-ga151 From 38c387b52d8404f8fd29d8c26bebc83a80733657 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Sep 2011 17:42:48 -0400 Subject: nfsd4: match close replays on stateid, not open owner id Keep around an unhashed copy of the final stateid after the last close using an openowner, and when identifying a replay, match against that stateid instead of just against the open owner id. Free it the next time the seqid is bumped or the stateowner is destroyed. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 47 ++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/nfs4xdr.c | 1 + fs/nfsd/state.h | 3 +++ 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 59b70afdc884..a174841b262e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -438,7 +438,6 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { - close_generic_stateid(stp); kmem_cache_free(stateid_slab, stp); } @@ -450,6 +449,7 @@ static void release_lock_stateid(struct nfs4_ol_stateid *stp) file = find_any_file(stp->st_file); if (file) locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); + close_generic_stateid(stp); free_generic_stateid(stp); } @@ -485,10 +485,16 @@ release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) } } -static void release_open_stateid(struct nfs4_ol_stateid *stp) +static void unhash_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); + close_generic_stateid(stp); +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + unhash_open_stateid(stp); free_generic_stateid(stp); } @@ -510,6 +516,8 @@ static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); list_del(&oo->oo_close_lru); + if (oo->oo_last_closed_stid) + free_generic_stateid(oo->oo_last_closed_stid); nfs4_free_openowner(oo); } @@ -2324,6 +2332,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str oo->oo_owner.so_seqid = open->op_seqid; oo->oo_flags = 0; oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&oo->oo_close_lru); hash_openowner(oo, clp, strhashval); return oo; @@ -3120,12 +3129,14 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_openowner * search_close_lru(u32 st_id) +static struct nfs4_openowner * search_close_lru(stateid_t *s) { struct nfs4_openowner *local; + struct nfs4_ol_stateid *os; list_for_each_entry(local, &close_lru, oo_close_lru) { - if (local->oo_owner.so_id == st_id) + os = local->oo_last_closed_stid; + if (same_stateid(&os->st_stid.sc_stateid, s)) return local; } return NULL; @@ -3589,6 +3600,27 @@ out: return status; } +void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *s; + + if (!so->so_is_open_owner) + return; + oo = openowner(so); + s = oo->oo_last_closed_stid; + if (!s) + return; + if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { + /* Release the last_closed_stid on the next seqid bump: */ + oo->oo_flags |= NFS4_OO_PURGE_CLOSE; + return; + } + oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; + free_generic_stateid(oo->oo_last_closed_stid); + oo->oo_last_closed_stid = NULL; +} + /* * nfs4_unlock_state() called after encode */ @@ -3613,7 +3645,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Also, we should make sure this isn't just the result of * a replayed close: */ - oo = search_close_lru(close->cl_stateid.si_stateownerid); + oo = search_close_lru(&close->cl_stateid); /* It's not stale; let's assume it's expired: */ if (oo == NULL) goto out; @@ -3630,8 +3662,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* release_stateid() calls nfsd_close() if needed */ - 
release_open_stateid(stp); + /* unhash_open_stateid() calls nfsd_close() if needed */ + oo->oo_last_closed_stid = stp; + unhash_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f4116cf16595..7bd57c2dbc4d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1636,6 +1636,7 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _ (char *)resp->p - (char *)save; memcpy(stateowner->so_replay.rp_buf, save, stateowner->so_replay.rp_buflen); + nfsd4_purge_closed_stateid(stateowner); } } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a8324b868a36..e807abb116f6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -370,8 +370,10 @@ struct nfs4_openowner { struct nfs4_stateowner oo_owner; /* must be first field */ struct list_head oo_perclient; struct list_head oo_close_lru; /* tail queue */ + struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 +#define NFS4_OO_PURGE_CLOSE 2 unsigned char oo_flags; }; @@ -514,5 +516,6 @@ extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(stateid_t *, bool); +extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ -- cgit v1.2.3-58-ga151 From 2da1cec713bc6d3ec9732e7d48b8bc0453580fd3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Sep 2011 18:56:20 -0400 Subject: nfsd4: simplify free_stateid We no longer need is_deleg_stateid, for example. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 66 ++++++++++++----------------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a174841b262e..fdd03f6ae044 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1098,16 +1098,6 @@ static struct nfs4_stid *find_stateid(stateid_t *t) return NULL; } -static struct nfs4_ol_stateid *find_ol_stateid(stateid_t *t) -{ - struct nfs4_stid *s; - - s = find_stateid(t); - if (!s) - return NULL; - return openlockstateid(s); -} - static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) { struct nfs4_stid *s; @@ -3251,11 +3241,6 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess return nfserr_old_stateid; } -static int is_delegation_stateid(stateid_t *stateid) -{ - return stateid->si_fileid == 0; -} - __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { struct nfs4_stid *s; @@ -3352,16 +3337,6 @@ out: return status; } -static __be32 -nfsd4_free_delegation_stateid(stateid_t *stateid) -{ - struct nfs4_delegation *dp = find_deleg_stateid(stateid); - if (dp) - return nfserr_locks_held; - - return nfserr_bad_stateid; -} - static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { @@ -3382,40 +3357,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfs_ok; } -/* - * Free a state id - */ __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) { stateid_t *stateid = &free_stateid->fr_stateid; - struct nfs4_ol_stateid *stp; - __be32 ret; + struct nfs4_stid *s; + __be32 ret = nfserr_bad_stateid; nfs4_lock_state(); - if 
(is_delegation_stateid(stateid)) { - ret = nfsd4_free_delegation_stateid(stateid); - goto out; - } - - stp = find_ol_stateid(stateid); - if (!stp) { - ret = nfserr_bad_stateid; - goto out; - } - ret = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, 1); - if (ret) + s = find_stateid(stateid); + if (!s) goto out; - - if (stp->st_stid.sc_type == NFS4_OPEN_STID) { + switch (s->sc_type) { + case NFS4_DELEG_STID: ret = nfserr_locks_held; goto out; - } else { - ret = nfsd4_free_lock_stateid(stp); - goto out; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + if (s->sc_type == NFS4_LOCK_STID) + ret = nfsd4_free_lock_stateid(openlockstateid(s)); + else + ret = nfserr_locks_held; } - out: nfs4_unlock_state(); return ret; @@ -3698,9 +3665,6 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) goto out; - status = nfserr_bad_stateid; - if (!is_delegation_stateid(stateid)) - goto out; status = nfserr_expired; dp = find_deleg_stateid(stateid); if (!dp) -- cgit v1.2.3-58-ga151 From d3b313a463c64c54d57c6af09c4a5d20106c1d1c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Sep 2011 15:02:41 -0400 Subject: nfsd4: construct stateid from clientid and counter Including the full clientid in the on-the-wire stateid allows more reliable detection of bad vs. expired stateid's, simplifies code, and ensures we won't reuse the opaque part of the stateid (as we currently do when the same openowner closes and reopens the same file). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 58 ++++++++++++----------------------------------------- fs/nfsd/state.h | 18 +++++------------ 2 files changed, 18 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fdd03f6ae044..922f47dd0d74 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,9 +49,7 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_ownerid = 1; -static u32 current_fileid = 1; -static u32 current_delegid = 1; +static u32 current_stateid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -136,11 +134,6 @@ unsigned int max_delegations; #define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) #define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -static unsigned int open_ownerid_hashval(const u32 id) -{ - return id & OPEN_OWNER_HASH_MASK; -} - static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; @@ -150,7 +143,6 @@ static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *owner return ret & OPEN_OWNER_HASH_MASK; } -static struct list_head open_ownerid_hashtbl[OPEN_OWNER_HASH_SIZE]; static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ @@ -255,9 +247,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp->dl_file = fp; dp->dl_type = type; dp->dl_stid.sc_type = NFS4_DELEG_STID; - dp->dl_stid.sc_stateid.si_boot = boot_time; - dp->dl_stid.sc_stateid.si_stateownerid = current_delegid++; - dp->dl_stid.sc_stateid.si_fileid = 0; + dp->dl_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; + dp->dl_stid.sc_stateid.si_opaque.so_id = current_stateid++; dp->dl_stid.sc_stateid.si_generation = 1; 
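For reference, the stateid layout this commit settles on fits in a few lines. The sketch below restates it as standalone user-space C; the field names mirror the diff, while init_stateid() and same_stateid() here are illustrative stand-ins for the kernel helpers.

/* New stateid layout: clientid plus a single global counter. */
#include <string.h>
#include <stdint.h>

typedef struct { uint32_t cl_boot; uint32_t cl_id; } clientid_t;
typedef struct { clientid_t so_clid; uint32_t so_id; } stateid_opaque_t;
typedef struct { uint32_t si_generation; stateid_opaque_t si_opaque; } stateid_t;

static uint32_t current_stateid = 1;

static void init_stateid(stateid_t *s, clientid_t clid)
{
        s->si_opaque.so_clid = clid;            /* embeds boot time too */
        s->si_opaque.so_id   = current_stateid++;
        s->si_generation     = 0;               /* bumped before first return */
}

/* Equality is a flat comparison of the opaque part. */
static int same_stateid(const stateid_t *a, const stateid_t *b)
{
        return memcmp(&a->si_opaque, &b->si_opaque,
                      sizeof(stateid_opaque_t)) == 0;
}

Since the counter never repeats within a boot, a close-and-reopen of the same file by the same openowner now yields a distinct opaque part, which the old (ownerid, fileid) scheme could not guarantee.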
hash_stid(&dp->dl_stid); fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); @@ -457,7 +448,6 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) { struct nfs4_ol_stateid *stp; - list_del(&lo->lo_owner.so_idhash); list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); while (!list_empty(&lo->lo_owner.so_stateids)) { @@ -502,7 +492,6 @@ static void unhash_openowner(struct nfs4_openowner *oo) { struct nfs4_ol_stateid *stp; - list_del(&oo->oo_owner.so_idhash); list_del(&oo->oo_owner.so_strhash); list_del(&oo->oo_perclient); while (!list_empty(&oo->oo_owner.so_stateids)) { @@ -1081,9 +1070,8 @@ static void gen_confirm(struct nfs4_client *clp) static int same_stateid(stateid_t *id_one, stateid_t *id_two) { - if (id_one->si_stateownerid != id_two->si_stateownerid) - return 0; - return id_one->si_fileid == id_two->si_fileid; + return 0 == memcmp(&id_one->si_opaque, &id_two->si_opaque, + sizeof(stateid_opaque_t)); } static struct nfs4_stid *find_stateid(stateid_t *t) @@ -2198,7 +2186,6 @@ alloc_init_file(struct inode *ino) INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); fp->fi_inode = igrab(ino); - fp->fi_id = current_fileid++; fp->fi_had_conflict = false; fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); @@ -2295,7 +2282,6 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj sop->so_owner.len = owner->len; INIT_LIST_HEAD(&sop->so_stateids); - sop->so_id = current_ownerid++; sop->so_client = clp; init_nfs4_replay(&sop->so_replay); return sop; @@ -2303,10 +2289,6 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - unsigned int idhashval; - - idhashval = open_ownerid_hashval(oo->oo_owner.so_id); - list_add(&oo->oo_owner.so_idhash, &open_ownerid_hashtbl[idhashval]); list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } @@ -2331,6 +2313,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str static inline void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_client *clp = oo->oo_owner.so_client; INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); @@ -2339,9 +2322,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_boot = boot_time; - stp->st_stid.sc_stateid.si_stateownerid = oo->oo_owner.so_id; - stp->st_stid.sc_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; + stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; hash_stid(&stp->st_stid); @@ -3095,8 +3077,6 @@ nfs4_laundromat(void) test_val = u; break; } - dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - oo->oo_owner.so_id); release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) @@ -3141,7 +3121,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) static int STALE_STATEID(stateid_t *stateid) { - if (stateid->si_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", 
STATEID_VAL(stateid)); @@ -3710,11 +3690,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -static unsigned int lockownerid_hashval(u32 id) -{ - return id & LOCK_HASH_MASK; -} - static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) @@ -3724,7 +3699,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, & LOCK_HASH_MASK; } -static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; /* @@ -3795,10 +3769,6 @@ find_lockowner_str(struct inode *inode, clientid_t *clid, static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { - unsigned int idhashval; - - idhashval = lockownerid_hashval(lo->lo_owner.so_id); - list_add(&lo->lo_owner.so_idhash, &lock_ownerid_hashtbl[idhashval]); list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -3831,6 +3801,7 @@ static struct nfs4_ol_stateid * alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = lo->lo_owner.so_client; stp = nfs4_alloc_stateid(); if (stp == NULL) @@ -3841,9 +3812,8 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp->st_stid.sc_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_boot = boot_time; - stp->st_stid.sc_stateid.si_stateownerid = lo->lo_owner.so_id; - stp->st_stid.sc_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; + stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; hash_stid(&stp->st_stid); @@ -4252,7 +4222,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * data structures. 
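One consequence worth spelling out: with the server boot time riding inside the clientid, the STALE_STATEID() change above can classify a pre-reboot stateid from the stateid alone, with no table lookup. A tiny self-contained sketch of that check (an illustration, not the kernel code):

#include <stdint.h>

typedef struct { uint32_t cl_boot; uint32_t cl_id; } clientid_t;
typedef struct { clientid_t so_clid; uint32_t so_id; } stateid_opaque_t;
typedef struct { uint32_t si_generation; stateid_opaque_t si_opaque; } stateid_t;

static uint32_t boot_time;      /* recorded once at server startup */

/* Issued before the last reboot: stale, as opposed to merely bad. */
static int stateid_is_stale(const stateid_t *s)
{
        return s->si_opaque.so_clid.cl_boot != boot_time;
}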
*/ INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, @@ -4398,12 +4368,10 @@ nfs4_state_init(void) } for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&open_ownerid_hashtbl[i]); } for (i = 0; i < STATEID_HASH_SIZE; i++) INIT_LIST_HEAD(&stateid_hashtbl[i]); for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } memset(&onestateid, ~0, sizeof(stateid_t)); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e807abb116f6..d6aec4f8d3dd 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -45,24 +45,20 @@ typedef struct { } clientid_t; typedef struct { - u32 so_boot; - u32 so_stateownerid; - u32 so_fileid; + clientid_t so_clid; + u32 so_id; } stateid_opaque_t; typedef struct { u32 si_generation; stateid_opaque_t si_opaque; } stateid_t; -#define si_boot si_opaque.so_boot -#define si_stateownerid si_opaque.so_stateownerid -#define si_fileid si_opaque.so_fileid #define STATEID_FMT "(%08x/%08x/%08x/%08x)" #define STATEID_VAL(s) \ - (s)->si_boot, \ - (s)->si_stateownerid, \ - (s)->si_fileid, \ + (s)->si_opaque.so_clid.cl_boot, \ + (s)->si_opaque.so_clid.cl_id, \ + (s)->si_opaque.so_id, \ (s)->si_generation struct nfsd4_callback { @@ -353,11 +349,9 @@ struct nfs4_replay { */ struct nfs4_stateowner { - struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ struct list_head so_stateids; int so_is_open_owner; /* 1=openowner,0=lockowner */ - u32 so_id; struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ @@ -415,8 +409,6 @@ struct nfs4_file { struct file_lock *fi_lease; atomic_t fi_delegees; struct inode *fi_inode; - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ bool fi_had_conflict; }; -- cgit v1.2.3-58-ga151 From f7a4d872078a5e143d88adb561627f637046b05a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Sep 2011 20:12:38 -0400 Subject: nfsd4: hash closed stateid's like any other Look up closed stateid's in the stateid hash like any other stateid rather than searching the close lru. This is simpler, and fixes a bug: currently we handle only the case of a close that is the last close for a given stateowner, but not the case of a close for a stateowner that still has active opens on other files. Thus in a case like: open(owner, file1) open(owner, file2) close(owner, file2) close(owner, file2) the final close won't be recognized as a retransmission. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 101 +++++++++++++++++++++++++++------------------------- fs/nfsd/state.h | 4 ++- 2 files changed, 56 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 922f47dd0d74..e5cba833613f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -406,7 +406,6 @@ static int nfs4_access_to_omode(u32 access) static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - list_del(&stp->st_stid.sc_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } @@ -437,6 +436,7 @@ static void release_lock_stateid(struct nfs4_ol_stateid *stp) struct file *file; unhash_generic_stateid(stp); + list_del(&stp->st_stid.sc_hash); file = find_any_file(stp->st_file); if (file) locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); @@ -485,6 +485,7 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp) static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_open_stateid(stp); + list_del(&stp->st_stid.sc_hash); free_generic_stateid(stp); } @@ -501,12 +502,22 @@ static void unhash_openowner(struct nfs4_openowner *oo) } } +static void release_last_closed_stateid(struct nfs4_openowner *oo) +{ + struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; + + if (s) { + list_del_init(&s->st_stid.sc_hash); + free_generic_stateid(s); + oo->oo_last_closed_stid = NULL; + } +} + static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); list_del(&oo->oo_close_lru); - if (oo->oo_last_closed_stid) - free_generic_stateid(oo->oo_last_closed_stid); + release_last_closed_stateid(oo); nfs4_free_openowner(oo); } @@ -3099,23 +3110,11 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_openowner * search_close_lru(stateid_t *s) -{ - struct nfs4_openowner *local; - struct nfs4_ol_stateid *os; - - list_for_each_entry(local, &close_lru, oo_close_lru) { - os = local->oo_last_closed_stid; - if (same_stateid(&os->st_stid.sc_stateid, s)) - return local; - } - return NULL; -} - -static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + return nfserr_bad_stateid; + return nfs_ok; } static int @@ -3283,7 +3282,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - if (s->sc_type == NFS4_DELEG_STID) { + switch (s->sc_type) { + case NFS4_DELEG_STID: dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) @@ -3293,10 +3293,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } - } else { /* open or lock stateid */ + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: stp = openlockstateid(s); - status = nfserr_bad_stateid; - if (nfs4_check_fh(current_fh, stp)) + status = nfs4_check_fh(current_fh, stp); + if (status) goto out; if (stp->st_stateowner->so_is_open_owner && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) @@ -3311,6 +3313,9 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, else *filpp = find_writeable_file(stp->st_file); } + break; + default: + return nfserr_bad_stateid; } status = nfs_ok; out: @@ -3362,6 +3367,9 @@ 
nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfsd4_free_lock_stateid(openlockstateid(s)); else ret = nfserr_locks_held; + break; + default: + ret = nfserr_bad_stateid; } out: nfs4_unlock_state(); @@ -3390,12 +3398,19 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ struct nfs4_stateowner *sop = stp->st_stateowner; __be32 status; - if (nfs4_check_fh(current_fh, stp)) - return nfserr_bad_stateid; status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - return check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + /* + * "Closed" stateid's exist *only* to return + * nfserr_replay_me from the previous step. + */ + return nfserr_bad_stateid; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + return status; + return nfs4_check_fh(current_fh, stp); } /* @@ -3564,8 +3579,13 @@ void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) return; } oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; - free_generic_stateid(oo->oo_last_closed_stid); - oo->oo_last_closed_stid = NULL; + release_last_closed_stateid(oo); +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + unhash_open_stateid(s); + s->st_stid.sc_type = NFS4_CLOSED_STID; } /* @@ -3584,24 +3604,10 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_dentry->d_name.name); nfs4_lock_state(); - /* check close_lru for replay */ - status = nfs4_preprocess_confirmed_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, &stp); - if (stp == NULL && status == nfserr_expired) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - oo = search_close_lru(&close->cl_stateid); - /* It's not stale; let's assume it's expired: */ - if (oo == NULL) - goto out; - cstate->replay_owner = &oo->oo_owner; - status = nfsd4_check_seqid(cstate, &oo->oo_owner, close->cl_seqid); - if (status) - goto out; - status = nfserr_bad_seqid; - } + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3609,9 +3615,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* unhash_open_stateid() calls nfsd_close() if needed */ + nfsd4_close_open_stateid(stp); oo->oo_last_closed_stid = stp; - unhash_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d6aec4f8d3dd..da68bf66a3d7 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -76,7 +76,9 @@ struct nfs4_stid { #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 - char sc_type; +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 + unsigned char sc_type; struct list_head sc_hash; stateid_t sc_stateid; }; -- cgit v1.2.3-58-ga151 From 3d02fa29dec920c597dd7b7db608a4bc71f088ce Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Mon, 19 Sep 2011 15:07:41 -0400 Subject: nfsd4: fix open downgrade, again Yet another open-management regression: - nfs4_file_downgrade() doesn't remove the BOTH access bit on downgrade, so the server's idea of the stateid's access gets out of sync with the client's. If we want to keep an O_RDWR open in this case, we should do that in the file_put_access logic rather than here. - We forgot to convert v4 access to an open mode here. This logic has proven too hard to get right. In the future we may consider: - reexamining the lock/openowner relationship (locks probably don't really need to take their own references here). - adding open upgrade/downgrade support to the vfs. - removing the atomic operations. They're redundant as long as this is all under some other lock. Also, maybe some kind of additional static checking would help catch O_/NFS4_SHARE_ACCESS confusion. Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e5cba833613f..edcced18caa8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -194,8 +194,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { - nfs4_file_put_fd(fp, O_RDWR); nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); } } @@ -3500,8 +3507,9 @@ static inline void nfs4_file_downgrade(struct nfs4_ol_stateid *stp, unsigned int int i; for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { - nfs4_file_put_access(stp->st_file, i); + if (test_bit(i, &stp->st_access_bmap) + && ((i & to_access) != i)) { + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); __clear_bit(i, &stp->st_access_bmap); } } -- cgit v1.2.3-58-ga151 From c856694e3d46976c76bf5b92091cb1efa211208d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Sep 2011 08:49:51 -0400 Subject: nfsd4: make op_cacheresult another flag I'm not sure why I used a new field for this originally. Also, the differences between some of these flags are a little subtle; add some comments to explain. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index aa769dfa756a..4e41f65c7021 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1013,14 +1013,15 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, - OP_MODIFIES_SOMETHING = 1 << 5, /* op is non-idempotent */ -}; - -struct nfsd4_operation { - nfsd4op_func op_func; - u32 op_flags; - char *op_name; /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. 
This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: * We use the DRC for compounds containing non-idempotent * operations, *except* those that are 4.1-specific (since * sessions provide their own EOS), and except for stateful @@ -1028,7 +1029,13 @@ struct nfsd4_operation { * (since sequence numbers provide EOS for open, lock, etc in * the v4.0 case). */ - bool op_cacheresult; + OP_CACHEME = 1 << 6, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; /* Try to get response size before operation */ nfsd4op_rsize op_rsize_bop; }; @@ -1077,7 +1084,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { - return OPDESC(op)->op_cacheresult; + return OPDESC(op)->op_flags & OP_CACHEME; } static bool need_wrongsec_check(struct svc_rqst *rqstp) @@ -1420,9 +1427,8 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_CREATE", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { @@ -1442,9 +1448,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, - .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, .op_name = "OP_LINK", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { @@ -1534,16 +1540,14 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_REMOVE", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { .op_func = (nfsd4op_func)nfsd4_rename, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_RENAME", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { @@ -1575,24 +1579,21 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", - .op_flags = OP_MODIFIES_SOMETHING, - .op_cacheresult = true, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_MODIFIES_SOMETHING, + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_MODIFIES_SOMETHING, + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID_CONFIRM", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { @@ -1601,9 +1602,8 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_WRITE", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, 
}, [OP_RELEASE_LOCKOWNER] = { -- cgit v1.2.3-58-ga151 From 8335ebd94b3f5bed7875cc35848bbe46d8381695 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Sep 2011 08:34:32 -0400 Subject: leases: split up generic_setlease into lock/unlock cases Eventually we should probably do the same thing to the file operations as well. Signed-off-by: J. Bruce Fields --- fs/locks.c | 98 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 9b8408eb6946..b342902c38bc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1352,18 +1352,7 @@ int fcntl_getlease(struct file *filp) return type; } -/** - * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted - * - * The (input) flp->fl_lmops->lm_break function is required - * by break_lease(). - * - * Called with file_lock_lock held. - */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; @@ -1372,30 +1361,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) lease = *flp; - error = -EACCES; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - goto out; - error = -EINVAL; - if (!S_ISREG(inode->i_mode)) + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; - error = security_file_lock(filp, arg); - if (error) + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) goto out; - time_out_leases(inode); - - BUG_ON(!(*flp)->fl_lmops->lm_break); - - if (arg != F_UNLCK) { - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((dentry->d_count > 1) - || (atomic_read(&inode->i_count) > 1))) - goto out; - } - /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1433,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; } - if (arg == F_UNLCK) - goto out; - error = -EINVAL; if (!leases_enable) goto out; @@ -1446,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) out: return error; } + +int generic_delete_lease(struct file *filp, struct file_lock **flp) +{ + struct file_lock *fl, **before; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file != filp) + continue; + return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + } + return -EAGAIN; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->lm_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. 
+ */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error; + + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->lm_break); + + switch (arg) { + case F_UNLCK: + return generic_delete_lease(filp, flp); + case F_RDLCK: + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: + BUG(); + } +} EXPORT_SYMBOL(generic_setlease); static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) -- cgit v1.2.3-58-ga151 From 2a74aba799bfbc02977b69400b7bf4d2850aea79 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Sep 2011 17:20:02 -0400 Subject: nfsd4: move client * to nfs4_stateid, add init_stid helper This will be convenient. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 8 ++++---- fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++---------------------- fs/nfsd/state.h | 2 +- 3 files changed, 31 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 93b5e405ad38..de018ecadae6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -787,7 +787,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -809,7 +809,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -832,7 +832,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); @@ -1006,7 +1006,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dp->dl_retries = 1; cb->cb_op = dp; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index edcced18caa8..cb36c9a2e8ca 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -224,6 +224,19 @@ static inline void hash_stid(struct nfs4_stid *stid) list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); } +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +{ + stateid_t *s = &stid->sc_stateid; + + stid->sc_type = type; + stid->sc_client = cl; + s->si_opaque.so_clid = cl->cl_clientid; + s->si_opaque.so_id = current_stateid++; + /* Will be incremented before return to client: */ + s->si_generation = 0; + hash_stid(stid); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, 
struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -245,19 +258,20 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't really meaningful, really, but let's + * avoid 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); - dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stid.sc_type = NFS4_DELEG_STID; - dp->dl_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; - dp->dl_stid.sc_stateid.si_opaque.so_id = current_stateid++; - dp->dl_stid.sc_stateid.si_generation = 1; - hash_stid(&dp->dl_stid); fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -2333,18 +2347,13 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd struct nfs4_openowner *oo = open->op_openowner; struct nfs4_client *clp = oo->oo_owner.so_client; + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_stid.sc_type = NFS4_OPEN_STID; stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; - stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; - /* note will be incremented before first return to client: */ - stp->st_stid.sc_stateid.si_generation = 0; - hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = 0; __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, @@ -2792,7 +2801,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) { list_del_init(&dp->dl_perclnt); @@ -2821,7 +2830,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) atomic_inc(&fp->fi_delegees); list_add(&dp->dl_perfile, &fp->fi_delegations); spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); return 0; } @@ -3295,7 +3304,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_client); + renew_client(dp->dl_stid.sc_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); @@ -3665,7 +3674,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_client); + renew_client(dp->dl_stid.sc_client); unhash_delegation(dp); out: @@ -3819,17 +3828,12 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, 
&lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; - stp->st_stid.sc_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; - stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; - /* note will be incremented before first return to client: */ - stp->st_stid.sc_stateid.si_generation = 0; - hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index da68bf66a3d7..70062b75e24a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -81,6 +81,7 @@ struct nfs4_stid { unsigned char sc_type; struct list_head sc_hash; stateid_t sc_stateid; + struct nfs4_client *sc_client; }; struct nfs4_delegation { @@ -88,7 +89,6 @@ struct nfs4_delegation { struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ atomic_t dl_count; /* ref count */ - struct nfs4_client *dl_client; struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; -- cgit v1.2.3-58-ga151 From 6136d2b409652b064b2da6d43d5c47cbd1d2cc14 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Sep 2011 16:21:15 -0400 Subject: nfsd4: use idr for stateid's The idr system is designed exactly for generating id and looking up integer id's. Thanks to Trond for pointing it out. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 124 +++++++++++++++++++++++++++++++--------------------- fs/nfsd/state.h | 1 - 2 files changed, 73 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cb36c9a2e8ca..a9e71cdf4a84 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -32,6 +32,7 @@ * */ +#include #include #include #include @@ -49,7 +50,6 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_stateid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -149,10 +149,7 @@ static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_ol_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) +struct idr stateids; static unsigned int file_hashval(struct inode *ino) { @@ -160,13 +157,7 @@ static unsigned int file_hashval(struct inode *ino) return hash_ptr(ino, FILE_HASH_BITS); } -static unsigned int stateid_hashval(stateid_t *s) -{ - return opaque_hashval(&s->si_opaque, sizeof(stateid_opaque_t)) & STATEID_HASH_MASK; -} - static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -215,26 +206,52 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } -static inline void hash_stid(struct nfs4_stid *stid) +static inline int get_new_stid(struct nfs4_stid *stid) { - stateid_t *s = &stid->sc_stateid; - unsigned int hashval; + static int min_stateid = 0; + int new_stid; + int error; + + if (!idr_pre_get(&stateids, GFP_KERNEL)) + return -ENOMEM; + + error = idr_get_new_above(&stateids, stid, min_stateid, &new_stid); + /* + * All this code is currently serialized; the preallocation + * above should still be ours: + */ + BUG_ON(error); + /* + * It shouldn't be 
a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ - hashval = stateid_hashval(s); - list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); + min_stateid = new_stid+1; + if (min_stateid == INT_MAX) + min_stateid = 0; + return new_stid; } -static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +static inline __be32 init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) { stateid_t *s = &stid->sc_stateid; + int new_id; stid->sc_type = type; stid->sc_client = cl; s->si_opaque.so_clid = cl->cl_clientid; - s->si_opaque.so_id = current_stateid++; + new_id = get_new_stid(stid); + if (new_id < 0) + return nfserr_jukebox; + s->si_opaque.so_id = (u32)new_id; /* Will be incremented before return to client: */ s->si_generation = 0; - hash_stid(stid); + return 0; } static struct nfs4_delegation * @@ -242,6 +259,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; + __be32 status; dprintk("NFSD alloc_init_deleg\n"); /* @@ -258,11 +276,15 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; - init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + status = init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + if (status) { + kmem_cache_free(deleg_slab, dp); + return NULL; + } /* * delegation seqid's are never incremented. The 4.1 special - * meaning of seqid 0 isn't really meaningful, really, but let's - * avoid 0 anyway just for consistency and use 1: + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: */ dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; @@ -300,11 +322,16 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) } } +static void unhash_stid(struct nfs4_stid *s) +{ + idr_remove(&stateids, s->sc_stateid.si_opaque.so_id); +} + /* Called under the state lock. 
*/ static void unhash_delegation(struct nfs4_delegation *dp) { - list_del_init(&dp->dl_stid.sc_hash); + unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -457,7 +484,7 @@ static void release_lock_stateid(struct nfs4_ol_stateid *stp) struct file *file; unhash_generic_stateid(stp); - list_del(&stp->st_stid.sc_hash); + unhash_stid(&stp->st_stid); file = find_any_file(stp->st_file); if (file) locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); @@ -506,7 +533,7 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp) static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_open_stateid(stp); - list_del(&stp->st_stid.sc_hash); + unhash_stid(&stp->st_stid); free_generic_stateid(stp); } @@ -528,7 +555,7 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo) struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; if (s) { - list_del_init(&s->st_stid.sc_hash); + unhash_stid(&s->st_stid); free_generic_stateid(s); oo->oo_last_closed_stid = NULL; } @@ -1099,23 +1126,9 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } -static int -same_stateid(stateid_t *id_one, stateid_t *id_two) -{ - return 0 == memcmp(&id_one->si_opaque, &id_two->si_opaque, - sizeof(stateid_opaque_t)); -} - static struct nfs4_stid *find_stateid(stateid_t *t) { - struct nfs4_stid *s; - unsigned int hashval; - - hashval = stateid_hashval(t); - list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) - if (same_stateid(&s->sc_stateid, t)) - return s; - return NULL; + return idr_find(&stateids, t->si_opaque.so_id); } static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) @@ -2342,12 +2355,14 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return oo; } -static inline void -init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_client *clp = oo->oo_owner.so_client; + __be32 status; - init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); + status = init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); + if (status) + return status; INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -2360,6 +2375,7 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; + return nfs_ok; } static void @@ -2949,7 +2965,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; - init_open_stateid(stp, fp, open); + status = init_open_stateid(stp, fp, open); + if (status) { + release_open_stateid(stp); + goto out; + } status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); @@ -3824,11 +3844,16 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = lo->lo_owner.so_client; + __be32 status; stp = nfs4_alloc_stateid(); if (stp == NULL) - goto out; - init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); + return NULL; + status = init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); + if (status) { + free_generic_stateid(stp); + return NULL; + } 
list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; @@ -3837,8 +3862,6 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - -out: return stp; } @@ -4386,8 +4409,7 @@ nfs4_state_init(void) for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } - for (i = 0; i < STATEID_HASH_SIZE; i++) - INIT_LIST_HEAD(&stateid_hashtbl[i]); + idr_init(&stateids); for (i = 0; i < LOCK_HASH_SIZE; i++) { INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 70062b75e24a..3ed5f99141ec 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -79,7 +79,6 @@ struct nfs4_stid { /* For an open stateid kept around *only* to process close replays: */ #define NFS4_CLOSED_STID 8 unsigned char sc_type; - struct list_head sc_hash; stateid_t sc_stateid; struct nfs4_client *sc_client; }; -- cgit v1.2.3-58-ga151 From 36279ac10c3d69372af875f1affafd375db687a9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 26 Sep 2011 12:53:00 -0400 Subject: nfsd4: assume test_stateid always has session Test_stateid is 4.1-only and only allowed after a sequence operation, so this check is unnecessary. Cc: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 +++--- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 2 +- fs/nfsd/xdr4.h | 1 - 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a9e71cdf4a84..daf75fa4c027 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3256,7 +3256,7 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess return nfserr_old_stateid; } -__be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) +__be32 nfs4_validate_stateid(stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; @@ -3268,7 +3268,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) s = find_stateid(stateid); if (!s) return nfserr_stale_stateid; - status = check_stateid_generation(stateid, &s->sc_stateid, has_session); + status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) return status; if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) @@ -3374,7 +3374,7 @@ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid *test_stateid) { - test_stateid->ts_has_session = nfsd4_has_session(cstate); + /* real work is done during encoding */ return nfs_ok; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7bd57c2dbc4d..2429fffa31dd 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3302,7 +3302,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, nfs4_lock_state(); for (i = 0; i < test_stateid->ts_num_ids; i++) { nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); + valid = nfs4_validate_stateid(&si); RESERVE_SPACE(4); *p++ = htonl(valid); resp->p = p; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3ed5f99141ec..55a4d6a108a2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -508,7 +508,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); 
-extern __be32 nfs4_validate_stateid(stateid_t *, bool); +extern __be32 nfs4_validate_stateid(stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index a767b57b8208..c9012149637c 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -343,7 +343,6 @@ struct nfsd4_saved_compoundargs { struct nfsd4_test_stateid { __be32 ts_num_ids; - bool ts_has_session; struct nfsd4_compoundargs *ts_saved_args; struct nfsd4_saved_compoundargs ts_savedp; }; -- cgit v1.2.3-58-ga151 From 38c2f4b12a455cb3a108fd5c79a10df2ba3ec9a7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Sep 2011 17:01:19 -0400 Subject: nfsd4: look up stateid's per clientid Use a separate stateid idr per client, and lookup a stateid by first finding the client, then looking up the stateid relative to that client. Also some minor refactoring. This allows us to improve error returns: we can return expired when the clientid is not found and bad_stateid when the clientid is found but not the stateid, as opposed to returning expired for both cases. I hope this will also help to replace the state lock mostly by a per-client lock, but that hasn't been done yet. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 112 +++++++++++++++++++++++----------------------------- fs/nfsd/nfs4xdr.c | 3 +- fs/nfsd/state.h | 4 +- 3 files changed, 54 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index daf75fa4c027..931155f51ecc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -32,7 +32,6 @@ * */ -#include #include #include #include @@ -149,8 +148,6 @@ static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -struct idr stateids; - static unsigned int file_hashval(struct inode *ino) { /* XXX: why are we hashing on inode pointer, anyway? */ @@ -209,13 +206,14 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) static inline int get_new_stid(struct nfs4_stid *stid) { static int min_stateid = 0; + struct idr *stateids = &stid->sc_client->cl_stateids; int new_stid; int error; - if (!idr_pre_get(&stateids, GFP_KERNEL)) + if (!idr_pre_get(stateids, GFP_KERNEL)) return -ENOMEM; - error = idr_get_new_above(&stateids, stid, min_stateid, &new_stid); + error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); /* * All this code is currently serialized; the preallocation * above should still be ours: @@ -324,7 +322,9 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) static void unhash_stid(struct nfs4_stid *s) { - idr_remove(&stateids, s->sc_stateid.si_opaque.so_id); + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); } /* Called under the state lock. 
*/ @@ -1126,16 +1126,16 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } -static struct nfs4_stid *find_stateid(stateid_t *t) +static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) { - return idr_find(&stateids, t->si_opaque.so_id); + return idr_find(&cl->cl_stateids, t->si_opaque.so_id); } -static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) +static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) { struct nfs4_stid *s; - s = find_stateid(t); + s = find_stateid(cl, t); if (!s) return NULL; if (typemask & s->sc_type) @@ -1143,16 +1143,6 @@ static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) return NULL; } -static struct nfs4_ol_stateid *find_ol_stateid_by_type(stateid_t *t, char typemask) -{ - struct nfs4_stid *s; - - s = find_stateid_by_type(t, typemask); - if (!s) - return NULL; - return openlockstateid(s); -} - static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -1175,6 +1165,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } } + idr_init(&clp->cl_stateids); memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; @@ -2611,24 +2602,24 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); } static __be32 -nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; - *dp = find_deleg_stateid(&open->op_delegate_stateid); + *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -2920,6 +2911,7 @@ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_ol_stateid *stp = NULL; @@ -2939,7 +2931,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(fp, open, &dp); + status = nfs4_check_deleg(cl, fp, open, &dp); if (status) goto out; } else { @@ -3256,7 +3248,7 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess return nfserr_old_stateid; } -__be32 nfs4_validate_stateid(stateid_t *stateid) +__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; @@ -3265,7 +3257,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid) if (STALE_STATEID(stateid)) return nfserr_stale_stateid; - s = find_stateid(stateid); + s = find_stateid(cl, stateid); if (!s) return nfserr_stale_stateid; status = check_stateid_generation(stateid, &s->sc_stateid, 1); @@ -3280,6 +3272,24 @@ __be32 
nfs4_validate_stateid(stateid_t *stateid) return nfs_ok; } +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) +{ + struct nfs4_client *cl; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + cl = find_confirmed_client(&stateid->si_opaque.so_clid); + if (!cl) + return nfserr_expired; + *s = find_stateid_by_type(cl, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + return nfs_ok; + +} + /* * Checks for stateid operations */ @@ -3303,18 +3313,9 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - - /* - * We assume that any stateid that has the current boot time, - * but that we can't find, is expired: - */ - status = nfserr_expired; - s = find_stateid(stateid); - if (!s) - goto out; + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); + if (status) + return status; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3384,10 +3385,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; + struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; nfs4_lock_state(); - s = find_stateid(stateid); + s = find_stateid(cl, stateid); if (!s) goto out; switch (s->sc_type) { @@ -3419,15 +3421,6 @@ setlkflg (int type) RD_STATE : WR_STATE; } -static __be32 nfs4_nospecial_stateid_checks(stateid_t *stateid) -{ - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return nfserr_bad_stateid; - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - return nfs_ok; -} - static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) { struct svc_fh *current_fh = &cstate->current_fh; @@ -3458,17 +3451,16 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, struct nfs4_ol_stateid **stpp) { __be32 status; + struct nfs4_stid *s; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - status = nfs4_nospecial_stateid_checks(stateid); + status = nfsd4_lookup_stateid(stateid, typemask, &s); if (status) return status; - *stpp = find_ol_stateid_by_type(stateid, typemask); - if (*stpp == NULL) - return nfserr_expired; + *stpp = openlockstateid(s); cstate->replay_owner = (*stpp)->st_stateowner; renew_client((*stpp)->st_stateowner->so_client); @@ -3673,6 +3665,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; struct inode *inode; __be32 status; @@ -3681,16 +3674,10 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, inode = cstate->current_fh.fh_dentry->d_inode; nfs4_lock_state(); - status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - goto out; - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - status = nfserr_expired; - dp = find_deleg_stateid(stateid); - if (!dp) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); + if (status) goto out; + dp = delegstateid(s); status = 
check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -4409,7 +4396,6 @@ nfs4_state_init(void) for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } - idr_init(&stateids); for (i = 0; i < LOCK_HASH_SIZE; i++) { INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2429fffa31dd..5779acde7e70 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3287,6 +3287,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_test_stateid *test_stateid) { struct nfsd4_compoundargs *argp; + struct nfs4_client *cl = resp->cstate.session->se_client; stateid_t si; __be32 *p; int i; @@ -3302,7 +3303,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, nfs4_lock_state(); for (i = 0; i < test_stateid->ts_num_ids; i++) { nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(&si); + valid = nfs4_validate_stateid(cl, &si); RESERVE_SPACE(4); *p++ = htonl(valid); resp->p = p; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 55a4d6a108a2..13f6f9f5ceec 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include #include #include #include "nfsfh.h" @@ -231,6 +232,7 @@ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct list_head cl_strhash; /* hash by cl_name */ struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ @@ -508,7 +510,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(stateid_t *); +extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ -- cgit v1.2.3-58-ga151 From c4253cb0748cd50060d04d838c38b07f1ad0e6e5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 22 Sep 2011 19:34:33 +0200 Subject: sysfs: add unsigned long cast to prevent compile warning "sysfs: use rb-tree for inode number lookup" added a new printk which causes a new compile warning on s390 (and few other architectures): fs/sysfs/dir.c: In function 'sysfs_link_sibling': fs/sysfs/dir.c:63:4: warning: format '%lx' expects argument of type 'long unsigned int', but argument 2 has type 'ino_t' [-Wform Add an explicit unsigned long cast since ino_t is an unsigned long on most architectures. 
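To see why the cast, rather than a format-string change, is the portable fix: ino_t is typedef'ed with different widths across architectures, so only an explicit promotion matches the "%lx" conversion everywhere. A reduced, hypothetical example (show_ino() is made up for illustration and is not part of this patch):

	static void show_ino(ino_t ino)
	{
		/* trips -Wformat on architectures where ino_t is not unsigned long */
		printk(KERN_CRIT "sysfs: inode '%lx'\n", ino);

		/* explicit promotion: warning-free on all architectures */
		printk(KERN_CRIT "sysfs: inode '%lx'\n", (unsigned long) ino);
	}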
Cc: Mikulas Patocka Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c3646d93a032..83bb9d1f30aa 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -60,7 +60,8 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) } else if (sd->s_ino > node->s_ino) { p = &node->inode_node.rb_right; } else { - printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", sd->s_ino); + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", + (unsigned long) sd->s_ino); BUG(); } #undef node -- cgit v1.2.3-58-ga151 From a542ad1bafc7df9fc16de8a6894b350a4df75572 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:52:59 +0200 Subject: btrfs: added helper functions to iterate backrefs These helper functions iterate back references and call a function for each backref. There is also a function to resolve an inode to a path in the file system. Signed-off-by: Jan Schmidt --- fs/btrfs/Makefile | 3 +- fs/btrfs/backref.c | 776 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/backref.h | 62 +++++ fs/btrfs/ioctl.h | 11 + 4 files changed, 851 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/backref.c create mode 100644 fs/btrfs/backref.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac08c21f..89b6ce3634fd 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \ + scrub.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 000000000000..2351df0de450 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,776 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" + +struct __data_ref { + struct list_head list; + u64 inum; + u64 root; + u64 extent_data_item_offset; +}; + +struct __shared_ref { + struct list_head list; + u64 disk_byte; +}; + +static int __inode_info(u64 inum, u64 ioff, u8 key_type, + struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + key.type = key_type; + key.objectid = inum; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || found_key->objectid != key.objectid) + return 1; + + return 0; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct btrfs_key key; + return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, + &key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_key *found_key) +{ + return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, + found_key); +} + +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! 
+ */ +static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + u32 len; + int slot; + u64 next_inum; + int ret; + s64 bytes_left = size - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + while (1) { + len = btrfs_inode_ref_name_len(eb, iref); + bytes_left -= len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + (unsigned long)(iref + 1), len); + if (eb != eb_in) + free_extent_buffer(eb); + ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret) + break; + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) + atomic_inc(&eb->refs); + btrfs_release_path(path); + + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key) +{ + int ret; + u64 flags; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + ret = btrfs_previous_item(fs_info->extent_root, path, + 0, BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + found_key->objectid > logical || + found_key->objectid + found_key->offset <= logical) + return -ENOENT; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return BTRFS_EXTENT_FLAG_TREE_BLOCK; + if (flags & BTRFS_EXTENT_FLAG_DATA) + return BTRFS_EXTENT_FLAG_DATA; + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. 
+ * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((void *)*ptr >= (void *)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_tree_block_info *info; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + *out_level = btrfs_tree_block_level(eb, info); + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long 
ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. + */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb; + int slot; + int nritems; + int ret; + int found = 0; + + eb = read_tree_block(fs_info->tree_root, logical, + fs_info->tree_root->leafsize, 0); + if (!eb) + return -EIO; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; + } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; + } + + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + + free_extent_buffer(eb); + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. will use the path given as a parameter and return it + * released. + * when the iterator function returns a non-zero value, iteration stops. 
+ */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx) +{ + unsigned long ptr = 0; + int last; + int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + struct list_head data_refs = LIST_HEAD_INIT(data_refs); + struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) + break; + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); + } + } + + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. 
+ */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 offset; + struct btrfs_key found_key; + + ret = extent_from_logical(fs_info, logical, path, + &found_key); + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -EINVAL; + if (ret < 0) + return ret; + + offset = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, path, found_key.objectid, + offset, iterate, ctx); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (1) { + ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, + &found_key); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + btrfs_release_path(path); + + item = btrfs_item_nr(eb, slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + ret = iterate(parent, iref, eb, ctx); + if (ret) { + free_extent_buffer(eb); + break; + } + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr; + fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, + inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->str[i] = fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->str[i]. 
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = kmalloc(alloc_bytes, GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 000000000000..92618837cb8f --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include "ioctl.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); +typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +#endif diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea789fcb4..1857e3871843 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,17 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +struct btrfs_data_container { + __u32 bytes_left; /* out -- bytes not needed to deliver output */ + __u32 bytes_missing; /* out -- additional bytes needed for result */ + __u32 elem_cnt; /* out */ + __u32 elem_missed; /* out */ + union { + char *str[0]; /* out */ + __u64 val[0]; /* out */ + }; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ struct btrfs_ioctl_vol_args) -- cgit v1.2.3-58-ga151 From 13db62b7a1e8c64763a93c155091620f85ff8920 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:56:13 +0200 Subject: btrfs scrub: added unverified_errors In normal operation, scrub is reading data sequentially in large portions. In case of an i/o error, we try to find the corrupted area(s) by issuing page-sized read requests. With this commit we increment the unverified_errors counter if all of the small size requests succeed. Userland patches carrying such conspicuous events to the administrator should already be around.
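For illustration, here is a stand-alone sketch of the accounting rule this commit introduces. The helper names (recheck_page, handle_failed_bio) are hypothetical, not the kernel's functions: a failed large bio is re-read page by page, only pages that still fail count as read errors, and if every page-sized re-read succeeds the event is counted once as unverified.

    #include <stdio.h>

    struct stats { unsigned read_errors; unsigned unverified_errors; };

    /* stands in for scrub_recheck_error(): re-read one page and return 1
     * (counting a read error) only if the page is still bad */
    static int recheck_page(int still_bad, struct stats *st)
    {
            if (!still_bad)
                    return 0;       /* page-sized re-read succeeded */
            st->read_errors++;
            return 1;
    }

    static void handle_failed_bio(const int *still_bad, int count,
                                  struct stats *st)
    {
            int i, any_bad = 0;

            for (i = 0; i < count; i++)
                    any_bad |= recheck_page(still_bad[i], st);
            if (!any_bad)   /* bio failed, but all small reads were fine */
                    st->unverified_errors++;
    }

    int main(void)
    {
            struct stats st = { 0, 0 };
            int pages[4] = { 0, 0, 0, 0 }; /* transient error: no page is bad */

            handle_failed_bio(pages, 4, &st);
            printf("read_errors=%u unverified_errors=%u\n",
                   st.read_errors, st.unverified_errors);
            return 0;
    }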
Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..35099fa97d56 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -201,18 +201,25 @@ nomem: * recheck_error gets called for every page in the bio, even though only * one may be bad */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) - return; + return 0; } } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; } static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -382,8 +389,14 @@ static void scrub_checksum(struct btrfs_work *work) int ret; if (sbio->err) { + ret = 0; for (i = 0; i < sbio->count; ++i) - scrub_recheck_error(sbio, i); + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +409,6 @@ static void scrub_checksum(struct btrfs_work *work) bi->bv_offset = 0; bi->bv_len = PAGE_SIZE; } - - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); goto out; } for (i = 0; i < sbio->count; ++i) { @@ -420,8 +429,14 @@ static void scrub_checksum(struct btrfs_work *work) WARN_ON(1); } kunmap_atomic(buffer, KM_USER0); - if (ret) - scrub_recheck_error(sbio, i); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } } out: -- cgit v1.2.3-58-ga151 From 558540c17771eaf89b1a3be39aa2c8bc837da1a6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:59:12 +0200 Subject: btrfs scrub: print paths of corrupted files While scrubbing, we may encounter various errors. Previously, a logical address was printed to the log only. Now, all paths belonging to that address are resolved and printed separately. That should work for hardlinks as well as reflinks. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 163 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 35099fa97d56..221fd5c48736 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,12 @@ */ #include +#include #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "backref.h" /* * This is only the first step towards a full-features scrub. 
It reads all @@ -100,6 +102,19 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -195,6 +210,143 @@ nomem: return ERR_PTR(-ENOMEM); } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + ipath->fspath->str[i]); + + free_ipath(ipath); + return 0; + +err: + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) +{ + struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u32 item_size; + int ret; + u64 ref_root; + u8 ref_level; + unsigned long ptr = 0; + const int bufsize = 4096; + u64 extent_offset; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; + swarn.errstr = errstr; + swarn.dev = dev; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + if (ret < 0) + goto out; + + extent_offset = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = 
path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, ei, item_size, + &ref_root, &ref_level); + printk(KERN_WARNING "%s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, dev->name, + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + } else { + swarn.path = path; + iterate_extent_inodes(fs_info, path, found_key.objectid, + extent_offset, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. In the latter case, @@ -205,6 +357,8 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { struct scrub_dev *sdev = sbio->sdev; u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (sbio->err) { if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, @@ -212,6 +366,11 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) return 0; } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } spin_lock(&sdev->stat_lock); @@ -326,9 +485,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: fixed up at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); return; uncorrectable: @@ -337,9 +495,8 @@ uncorrectable: ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: unable to fixup at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, -- cgit v1.2.3-58-ga151 From 193ea74b2729e6ddc08fb6bde6e15a3bd4d94071 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:56:54 +0200 Subject: btrfs scrub: bugfix: mirror_num off by one Fix the mirror_num determination in scrub_stripe. The rest of the scrub code did not use mirror_num for anything important and that error went unnoticed. The nodatasum fixup patch of this set depends on a correct mirror_num. 
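To make the numbering convention concrete, a minimal stand-alone sketch (simplified, not kernel code): mirror_num is 1-based, with 0 meaning "no preference", so a loop over 0-based stripe indices must compare i + 1 when skipping the copy that failed.

    #include <stdio.h>

    int main(void)
    {
            int num_stripes = 2;    /* e.g. RAID1: two copies */
            int mirror_num = 1;     /* 1-based number of the failed copy */
            int i;

            for (i = 0; i < num_stripes; i++) {
                    /* before the fix this compared "i == mirror_num" and
                     * skipped the wrong stripe */
                    if (i + 1 == mirror_num)
                            continue;
                    printf("trying stripe %d (mirror %d) for a good copy\n",
                           i, i + 1);
            }
            return 0;
    }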
Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 221fd5c48736..59caf8fcd1c7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -452,7 +452,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) * first find a good copy */ for (i = 0; i < multi->num_stripes; ++i) { - if (i == sbio->spag[ix].mirror_num) + if (i + 1 == sbio->spag[ix].mirror_num) continue; if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, @@ -930,21 +930,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 0; + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes; + mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else { increment = map->stripe_len; - mirror_num = 0; + mirror_num = 1; } path = btrfs_alloc_path(); -- cgit v1.2.3-58-ga151 From 8ddc7d9cd0a00062247c732b96386ec2462bdbc7 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 20:02:58 +0200 Subject: btrfs: add mirror_num to extent_read_full_page Currently, extent_read_full_page always assumes we are trying to read mirror 0, which generally is the best we can do. To add flexibility, pass it as a parameter. This will be needed by scrub fixup code. 
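The resulting call-site shapes, excerpted from the hunks below and from the nodatasum fixup patch later in this series (a usage sketch, not a compilable unit):

    /* default read path: mirror_num 0 lets the mapping code pick any copy */
    ret = extent_read_full_page(tree, page, btrfs_get_extent, 0);

    /* repair path: pin the read to one specific copy */
    ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
                                btrfs_get_extent, fixup->mirror_num);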
Signed-off-by: Jan Schmidt --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..dc0343802535 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -908,7 +908,7 @@ static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..afebb95e3490 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2076,16 +2076,16 @@ out: } int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); if (bio) - ret = submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..a9dd994bf826 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -185,7 +185,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..efee8c7cfa45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6289,7 +6289,7 @@ int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent); + return extent_read_full_page(tree, page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -- cgit v1.2.3-58-ga151 From e12fa9cd390f8e93a9144bd99bd6f6ed316fbc1e Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 17 Jun 2011 15:55:21 +0200 Subject: btrfs scrub: use int for mirror_num, not u64 the rest of the code uses int mirror_num, and so should scrub Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 59caf8fcd1c7..41a01147b959 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -65,7 +65,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); struct scrub_page { u64 flags; /* extent flags */ u64 generation; - u64 mirror_num; + int mirror_num; int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -776,7 +776,7 @@ nomem: } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num, + u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_bio *sbio; @@ -873,7 +873,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for 
each bio */ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num) + u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -919,7 +919,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, u64 physical; u64 logical; u64 generation; - u64 mirror_num; + int mirror_num; u64 increment = map->stripe_len; u64 offset; -- cgit v1.2.3-58-ga151 From 0ef8e45158f97dde4801b535e25f70f7caf01a27 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 20:04:15 +0200 Subject: btrfs scrub: add fixup code for errors on nodatasum files This removes a FIXME comment and introduces the first part of nodatasum fixup: It gets the corresponding inode for a logical address and triggers a regular readpage for the corrupted sector. Once we have on-the-fly error correction our error will be automatically corrected. The correction code is expected to clear the newly introduced EXTENT_DAMAGED flag, making scrub report that error as "corrected" instead of "uncorrectable" eventually. Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.h | 1 + fs/btrfs/scrub.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a9dd994bf826..435d454b9926 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,7 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_DAMAGED (1 << 13) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 41a01147b959..db09f01c0e4f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -22,6 +22,7 @@ #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "transaction.h" #include "backref.h" /* @@ -89,6 +90,7 @@ struct scrub_dev { int first_free; int curr; atomic_t in_flight; + atomic_t fixup_cnt; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -102,6 +104,14 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_fixup_nodatasum { + struct scrub_dev *sdev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -190,12 +200,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; - else + else sdev->bios[i]->next_free = -1; } sdev->first_free = 0; sdev->curr = -1; atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); @@ -347,6 +358,151 @@ out: kfree(swarn.msg_buf); } +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct page *page; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = ctx; + int ret; + int corrected; + struct btrfs_key key; + struct inode *inode; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + 
inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); + + /* set_extent_bit should either succeed or give proper error */ + WARN_ON(ret > 0); + if (ret) + return ret < 0 ? ret : -EFAULT; + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, fixup->mirror_num); + wait_on_page_locked(page); + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, NULL); + + if (corrected) + WARN_ON(!PageUptodate(page)); + else + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); + + put_page(page); + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_dev *sdev; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sdev = fixup->sdev; + fs_info = fixup->root->fs_info; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.malloc_errors; + spin_unlock(&sdev->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); + } + + btrfs_free_path(path); + kfree(fixup); + + /* see caller why we're pretending to be paused in the scrub counters */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sdev->fixup_cnt); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sdev->list_wait); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. 
In the latter case, @@ -417,6 +573,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_multi_bio *multi = NULL; + struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; int i; @@ -425,12 +582,30 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* - * nodatasum, don't try to fix anything - * FIXME: we can do better, open the inode and trigger a - * writeback + * increment scrubs_running to prevent cancel requests from + * completing as long as a fixup worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the fixup worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. */ - goto uncorrectable; + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sdev->fixup_cnt); + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } length = PAGE_SIZE; @@ -1425,10 +1600,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, ret = scrub_enumerate_chunks(sdev, start, end); wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); + wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + if (progress) memcpy(progress, &sdev->stat, sizeof(*progress)); -- cgit v1.2.3-58-ga151 From d7728c960dccf775b92f2c4139f1216275a45c44 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 7 Jul 2011 16:48:38 +0200 Subject: btrfs: new ioctls to do logical->inode and inode->path resolving these ioctls make use of the new functions initially added for scrub. they return all inodes belonging to a logical address (BTRFS_IOC_LOGICAL_INO) and all paths belonging to an inode (BTRFS_IOC_INO_PATHS). Signed-off-by: Jan Schmidt --- fs/btrfs/ioctl.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 19 ++++++++ 2 files changed, 162 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 538f65a79ec5..7f57efa76d11 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "volumes.h" #include "locking.h" #include "inode-map.h" +#include "backref.h" /* Mask out flags that are inappropriate for the given type of inode. 
*/ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -2855,6 +2856,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + unsigned long rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str; + ipath->fspath->str[i] = (void *)rel_ptr; + } + + ret = copy_to_user(ipa->fspath, ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + u64 extent_offset; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + struct btrfs_key key; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 4096); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -ENOENT; + if (ret < 0) + goto out; + + extent_offset = loi->logical - key.objectid; + ret = iterate_extent_inodes(root->fs_info, path, key.objectid, + extent_offset, build_ino_list, inodes); + + if (ret < 0) + goto out; + + ret = copy_to_user(loi->inodes, inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kfree(inodes); + kfree(loi); + + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2912,6 +3051,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 1857e3871843..2da30d4950e6 100644 
--- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -204,6 +204,20 @@ struct btrfs_data_container { }; }; +struct btrfs_ioctl_ino_path_args { + __u64 inum; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + struct btrfs_data_container *fspath; /* out */ +}; + +struct btrfs_ioctl_logical_ino_args { + __u64 logical; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + struct btrfs_data_container *inodes; /* out */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -259,4 +273,9 @@ struct btrfs_data_container { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ + struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_ino_path_args) + #endif -- cgit v1.2.3-58-ga151 From a1d3c4786a4b9c71c0767aa656a759968f7554b6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 17:15:33 +0200 Subject: btrfs: btrfs_multi_bio replaced with btrfs_bio btrfs_bio is a bio abstraction able to split and not complete after the last bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio tracks the mirror_num used to read data which can be used for error correction purposes. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 10 ++-- fs/btrfs/scrub.c | 20 ++++---- fs/btrfs/volumes.c | 128 ++++++++++++++++++++++++++----------------------- fs/btrfs/volumes.h | 10 ++-- 4 files changed, 90 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..119f842c1d4f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1770,18 +1770,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { if (!stripe->dev->can_discard) continue; @@ -1800,7 +1800,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(multi); + kfree(bbio); } if (actual_bytes) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index db09f01c0e4f..97142a218f0a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -572,7 +572,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; @@ -610,8 +610,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); 
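The layout keeps the flexible-array idiom of btrfs_multi_bio, with mirror_num added. A stand-alone userspace sketch of the allocation pattern follows; the fields are a simplified, hypothetical subset, and bbio_size() only mirrors the shape of the real btrfs_bio_size() macro:

    #include <stdio.h>
    #include <stdlib.h>

    /* simplified stand-ins for the kernel structs touched by this patch */
    struct bio_stripe {
            unsigned long long physical;
    };

    struct bbio {
            int num_stripes;
            int mirror_num;                 /* copy a READ was served from */
            struct bio_stripe stripes[];    /* flexible array, sized at alloc */
    };

    #define bbio_size(n) (sizeof(struct bbio) + \
                          sizeof(struct bio_stripe) * (n))

    int main(void)
    {
            int n = 2;
            struct bbio *bbio = calloc(1, bbio_size(n));

            if (!bbio)
                    return 1;
            bbio->num_stripes = n;
            bbio->mirror_num = 1;   /* 1-based, i.e. stripe_index + 1 */
            printf("%zu bytes for %d stripes, mirror_num=%d\n",
                   bbio_size(n), bbio->num_stripes, bbio->mirror_num);
            free(bbio);
            return 0;
    }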
@@ -619,19 +619,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { + for (i = 0; i < bbio->num_stripes; ++i) { if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -640,7 +640,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -655,7 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); @@ -665,7 +665,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..9db4d7962f9b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2848,7 +2848,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, + struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; @@ -2866,18 +2866,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int i; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; - if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), GFP_NOFS); - if (!multi) + if (!bbio) return -ENOMEM; - atomic_set(&multi->error, 0); + atomic_set(&bbio->error, 0); } read_lock(&em_tree->lock); @@ -2898,7 +2898,7 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our multi bio struct is too small, back off and try again */ + /* if our btrfs_bio struct is too small, back off and try again */ if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { @@ -2917,11 +2917,11 @@ again: stripes_required = map->num_stripes; } } - if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); - kfree(multi); + kfree(bbio); goto again; } stripe_nr = offset; @@ -2950,7 +2950,7 @@ again: *length = em->len - offset; } - if (!multi_ret) + if (!bbio_ret) goto out; num_stripes = 1; @@ -2975,13 +2975,17 @@ again: stripe_index = find_live_mirror(map, 0, map->num_stripes, current->pid % map->num_stripes); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 
1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -3001,6 +3005,7 @@ again: stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); + mirror_num = stripe_index + 1; } } else { /* @@ -3009,15 +3014,16 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { u64 stripes; @@ -3038,16 +3044,16 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, map->num_stripes); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i == 0) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; stripe_offset = 0; } if (stripe_index == last_stripe) - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u64 stripes; @@ -3072,11 +3078,11 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, factor); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i < map->sub_stripes) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; if (i == map->sub_stripes - 1) stripe_offset = 0; @@ -3084,11 +3090,11 @@ again: if (stripe_index >= last_stripe && stripe_index <= (last_stripe + map->sub_stripes - 1)) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } } else - multi->stripes[i].length = *length; + bbio->stripes[i].length = *length; stripe_index++; if (stripe_index == map->num_stripes) { @@ -3099,19 +3105,20 @@ again: } } else { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = + bbio->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } out: free_extent_map(em); @@ -3120,9 +3127,9 @@ out: int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, mirror_num); } @@ -3191,28 +3198,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) - atomic_inc(&multi->error); + atomic_inc(&bbio->error); - if (bio == multi->orig_bio) + if (bio == 
bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; } else if (err) { /* @@ -3222,7 +3229,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); + kfree(bbio); bio_endio(bio, err); } else if (!is_orig_bio) { @@ -3302,20 +3309,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); BUG_ON(ret); - total_devs = multi->num_stripes; + total_devs = bbio->num_stripes; if (map_length < length) { printk(KERN_CRIT "mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -3323,25 +3330,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, (unsigned long long)map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); + + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; + bio->bi_private = bbio; + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; + dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + dev->name, dev->devid, bio->bi_size); bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -3354,8 +3364,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } dev_nr++; } - if (total_devs == 1) - kfree(multi); return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..71f4f3f67495 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -136,7 +136,10 @@ struct btrfs_bio_stripe { u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio { atomic_t stripes_pending; bio_end_io_t *end_io; struct bio *orig_bio; @@ -144,6 
+147,7 @@ struct btrfs_multi_bio { atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; @@ -171,7 +175,7 @@ struct map_lookup { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -180,7 +184,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); -- cgit v1.2.3-58-ga151 From 1503140d3ec2be9b917d2f8f7c64cb77b79a215b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 16 Jun 2011 14:37:03 +0200 Subject: btrfs: Do not use bio->bi_bdev after submission The block layer modifies bio->bi_bdev and bio->bi_sector while working on the bio; they do _not_ come back unmodified in the completion callback. To call add_page, we need at least some bi_bdev set, which is why the code was working previously. With this patch, we use the latest_bdev from fsinfo instead of the leftover in the bio. This gives us the possibility to use the bi_bdev field for another purpose. Signed-off-by: Jan Schmidt --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index efee8c7cfa45..936a6fabaa9f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1915,7 +1915,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_size = 0; bio_add_page(bio, page, failrec->len, start - page_offset(page)); -- cgit v1.2.3-58-ga151 From 2774b2ca3d49124bf1ae89e8d575b3dab4221266 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 16 Jun 2011 12:05:19 +0200 Subject: btrfs: Put mirror_num in bi_bdev The error correction code wants to make sure that only the bad mirror is rewritten. Thus, we need to know which mirror is the bad one. I did not find a more appropriate field than bi_bdev. But I think using this is fine, because it is modified by the block layer anyway and should not be read after the bio has returned.
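In effect, the mirror number is just an integer smuggled through a pointer-sized field once the block layer is done with it. A minimal stand-alone sketch of the round-trip casts (stored on completion in btrfs_end_bio, recovered in the readpage error path):

    #include <stdio.h>

    struct block_device;    /* opaque, as in the kernel */

    int main(void)
    {
            int mirror_num = 2;
            struct block_device *bi_bdev;
            int failed_mirror;

            /* completion side: stash the mirror number in the pointer field */
            bi_bdev = (struct block_device *)(unsigned long)mirror_num;

            /* error path: recover it with the inverse cast */
            failed_mirror = (int)(unsigned long)bi_bdev;

            printf("failed_mirror=%d\n", failed_mirror);
            return 0;
    }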
Signed-off-by: Jan Schmidt --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9db4d7962f9b..18baac5a3f6c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3216,6 +3216,8 @@ static void btrfs_end_bio(struct bio *bio, int err) } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ -- cgit v1.2.3-58-ga151 From 4a54c8c165b66300830a67349fc7595e3fc442f7 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 22 Jul 2011 15:41:52 +0200 Subject: btrfs: Moved repair code from inode.c to extent_io.c The raid-retry code in inode.c can be generalized so that it works for metadata as well. Thus, this patch moves it to extent_io.c and makes the raid-retry code a raid-repair code. Repair works that way: Whenever a read error occurs and we have more mirrors to try, note the failed mirror, and retry another. If we find a good one, check if we did note a failure earlier and if so, do not allow the read to complete until after the bad sector was written with the good data we just fetched. As we have the extent locked while reading, no one can change the data in between. Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.c | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/extent_io.h | 10 +- fs/btrfs/inode.c | 155 +-------------------- 3 files changed, 393 insertions(+), 159 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index afebb95e3490..624ef10d36cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@ #include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree, return 0; } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + if (did_repair) { + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + } + + kfree(rec); + return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ + complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. 
here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchronization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. + */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) +{ + struct bio *bio; + struct btrfs_device *dev; + DECLARE_COMPLETION_ONSTACK(compl); + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + int ret; + + BUG_ON(!mirror_num); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_private = &compl; + bio->bi_end_io = repair_io_failure_callback; + bio->bi_size = 0; + map_length = length; + + ret = btrfs_map_block(map_tree, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start-page_offset(page)); + submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + return -EIO; + } + + printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " + "sector %llu)\n", page->mapping->host->i_ino, start, + dev->name, sector); + + bio_put(bio); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_mapping_tree *map_tree; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + struct inode *inode = page->mapping->host; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start == failrec->start) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + num_copies = btrfs_num_copies(map_tree, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(map_tree, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook).
if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" + "state=%p, num_copies=%d, next_mirror %d, " + "failed_mirror %d\n", state, num_copies, + failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + if (!state) { + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&tree->lock); + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (!state || failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " + "next_mirror %d, failed_mirror %d\n", state, + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, + failrec->bio_flags, 0); + return 0; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ /* @@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; + pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err) state); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + if (!uptodate) { + u64 failed_mirror; + failed_mirror = (u64)bio->bi_bdev; + if (tree->ops && tree->ops->readpage_io_failed_hook) + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, NULL); + else + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { uptodate = 
test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, mirror_num, bio_flags, start); else submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); @@ -2926,7 +3301,7 @@ out: return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; @@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len) { return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 435d454b9926..a8e20b672922 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -68,7 +68,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, + u64 start, u64 end, u64 failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -252,6 +252,8 @@ void free_extent_buffer(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +unsigned long num_extent_pages(u64 start, u64 len); +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); static inline void extent_buffer_get(struct extent_buffer *eb) { @@ -301,4 +303,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); + +struct btrfs_mapping_tree; + +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 936a6fabaa9f..9327f45434e8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,10 +45,10 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" -#include "volumes.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "volumes.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -1818,154 +1818,10 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return btrfs_finish_ordered_io(page->mapping->host, start, end); } -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - struct extent_state *state) -{ - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; - struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int rw; - u64 logical; - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->last_mirror = 0; - failrec->bio_flags = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - - if (IS_ERR_OR_NULL(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - failrec->logical = logical; - free_extent_map(em); - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | - EXTENT_DIRTY, GFP_NOFS); - set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - failrec->last_mirror++; - if (!state) { - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - } - if (!state || failrec->last_mirror > num_copies) { - set_state_private(failure_tree, failrec->start, 0); - clear_extent_bits(failure_tree, failrec->start, - failrec->start + failrec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - kfree(failrec); - return -EIO; - } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; - - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & REQ_WRITE) - rw = WRITE; - else - rw = READ; - - ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror, - failrec->bio_flags, 0); - return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failure; - int ret; - - private = 0; - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0)) { - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, - start, &private_failure); - if (ret == 0) { - failure = (struct io_failure_record *)(unsigned long) - private_failure; - 
set_state_private(&BTRFS_I(inode)->io_failure_tree, - failure->start, 0); - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, - failure->start, - failure->start + failure->len - 1, - EXTENT_DIRTY | EXTENT_LOCKED, - GFP_NOFS); - kfree(failure); - } - } - return 0; -} - /* * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) @@ -2011,10 +1867,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, kunmap_atomic(kaddr, KM_USER0); good: - /* if the io failure tree for this inode is non-empty, - * check to see if we've recovered from a failed IO - */ - btrfs_clean_io_failures(inode, start); return 0; zeroit: @@ -7420,7 +7272,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, - .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, -- cgit v1.2.3-58-ga151 From 5da6fcbc4eb50c0f55d520750332f5a6ab13508c Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 18:11:04 +0200 Subject: btrfs: integrating raid-repair and scrub-fixup-nodatasum This ties nodatasum fixup in scrub together with raid repair patches. While both series are working fine alone, scrub will report uncorrectable errors if they occur in a nodatasum extent *and* the page is in the page cache. Previously, we would have triggered readpage to find good data and do the repair. However, readpage wouldn't read anything in the case where the page is up to date in the cache. So, we simply take that good data we have and call repair_io_failure directly (unless the page in the cache is dirty). Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 92 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 97142a218f0a..eba42e5fd5fd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -24,6 +24,7 @@ #include "ordered-data.h" #include "transaction.h" #include "backref.h" +#include "extent_io.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -360,13 +361,13 @@ out: static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) { - struct page *page; + struct page *page = NULL; unsigned long index; struct scrub_fixup_nodatasum *fixup = ctx; int ret; - int corrected; + int corrected = 0; struct btrfs_key key; - struct inode *inode; + struct inode *inode = NULL; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; @@ -384,34 +385,75 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); - - /* set_extent_bit should either succeed or give proper error */ - WARN_ON(ret > 0); - if (ret) - return ret < 0 ? 
ret : -EFAULT; - index = offset >> PAGE_CACHE_SHIFT; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + struct btrfs_mapping_tree *map_tree; + if (PageDirty(page)) { + /* + * we need to write the data to the defective sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error as + * uncorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } - ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, - btrfs_get_extent, fixup->mirror_num); - wait_on_page_locked(page); - corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, NULL); + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); - if (corrected) - WARN_ON(!PageUptodate(page)); - else - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); - put_page(page); - iput(inode); +out: + if (page) + put_page(page); + if (inode) + iput(inode); if (ret < 0) return ret; -- cgit v1.2.3-58-ga151 From bb82ab88dfdb12948af58989c75bfe904bc1b09d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 14:06:53 +0200 Subject: btrfs: add an extra wait mode to read_extent_buffer_pages read_extent_buffer_pages currently has two modes: either trigger a read without waiting for anything, or wait for the I/O to finish. The former also bails when it's unable to lock the page. This patch now adds an additional parameter to allow it to block on page lock, but not wait for completion.
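As a rough usage sketch (not part of the patch; io_tree, eb and btree_get_extent stand in for a caller's extent IO tree, extent buffer and get_extent callback), the three modes introduced below would be used like this:

	/* fire and forget: trylock each page, bail out if any page is already locked */
	read_extent_buffer_pages(io_tree, eb, 0, WAIT_NONE, btree_get_extent, 0);

	/* block until every page is locked, but do not wait for the read I/O */
	read_extent_buffer_pages(io_tree, eb, 0, WAIT_PAGE_LOCK, btree_get_extent, 0);

	/* fully synchronous read: wait until the I/O has completed */
	read_extent_buffer_pages(io_tree, eb, 0, WAIT_COMPLETE, btree_get_extent, 0);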
Changes v5: - merge the 2 wait parameters into one and define WAIT_NONE, WAIT_COMPLETE and WAIT_PAGE_LOCK Change v6: - fix bug introduced in v5 Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/extent_io.h | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..e6170e142cad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) @@ -974,7 +975,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..823028e73cf8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3349,8 +3349,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3386,7 +3385,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3430,7 +3429,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..6d74c6b34691 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,6 +248,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); -- cgit v1.2.3-58-ga151 From ab0fff03055d2d1b01a7581badeba18db9c4f55c Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 23 May 2011 14:25:41 +0200 Subject: btrfs: add READAHEAD extent buffer flag Add a READAHEAD extent buffer flag. Add a function to trigger a read with this flag set. 
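A hedged caller sketch (not part of the patch; root, bytenr, blocksize and mirror_num are assumed to come from the caller): per the diff below, reada_tree_block_flagged returns 0 both when it merely triggered the read and when the buffer was already uptodate, and only in the latter case does it hand back the buffer:

	struct extent_buffer *eb = NULL;
	int err;

	err = reada_tree_block_flagged(root, bytenr, blocksize, mirror_num, &eb);
	if (err)
		return err;	/* read failed or the buffer is marked corrupt */
	if (eb) {
		/* buffer was already uptodate; use it, then drop the reference */
		free_extent_buffer(eb);
	}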
Changes v2: - use extent buffer flags instead of extent state flags Changes v5: - adapt to changed read_extent_buffer_pages interface - don't return eb from reada_tree_block_flagged if it has CORRUPT flag set Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/extent_io.h | 1 + 3 files changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6170e142cad..1220d04072c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -980,6 +980,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..b3bdb5c1390f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d74c6b34691..fcaf49bcb880 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -- cgit v1.2.3-58-ga151 From 90519d66abbccc251d14719ac76f191f70826e40 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 23 May 2011 14:30:00 +0200 Subject: btrfs: state information for readahead Add state information for readahead to btrfs_fs_info and btrfs_device Changes v2: - don't wait in radix_trees - add own set of workers for readahead Reviewed-by: Josef Bacik Signed-off-by: Arne Jansen --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/disk-io.c | 10 ++++++++++ fs/btrfs/volumes.c | 8 ++++++++ fs/btrfs/volumes.h | 8 ++++++++ 4 files changed, 31 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..f71fd24cc152 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1036,6 +1036,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; struct btrfs_workers caching_workers; 
+ struct btrfs_workers readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1119,6 +1120,10 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1220d04072c8..fd28c40e9ec3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1711,6 +1711,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); + fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1903,6 +1907,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->readahead_workers, "readahead", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1913,6 +1920,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; + fs_info->readahead_workers.idle_thresh = 2; btrfs_start_workers(&fs_info->workers, 1); btrfs_start_workers(&fs_info->generic_worker, 1); @@ -1926,6 +1934,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_freespace_worker, 1); btrfs_start_workers(&fs_info->delayed_workers, 1); btrfs_start_workers(&fs_info->caching_workers, 1); + btrfs_start_workers(&fs_info->readahead_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2650,6 +2659,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..1dccce5bc93d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(&device->dev_alloc_list); + /* init readahead state */ + spin_lock_init(&device->reada_lock); + device->reada_curr_zone = NULL; + atomic_set(&device->reada_in_flight, 0); + device->reada_next = 0; + INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); + mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..2a751246188a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,14 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; }; struct btrfs_fs_devices { -- cgit v1.2.3-58-ga151 From 7414a03fbf9e75fbbf2a3c16828cd862e572aa44 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: 
Mon, 23 May 2011 14:33:49 +0200 Subject: btrfs: initial readahead code and prototypes This is the implementation for the generic read ahead framework. To trigger a readahead, btrfs_reada_add must be called. It will start a read ahead for the given range [start, end) on tree root. The returned handle can either be used to wait on the readahead to finish (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). The read ahead works as follows: On btrfs_reada_add, the root of the tree is inserted into a radix_tree. reada_start_machine will then search for extents to prefetch and trigger some reads. When a read finishes for a node, all contained node/leaf pointers that lie in the given range will also be enqueued. The reads will be triggered in sequential order, thus giving a big win over a naive enumeration. It will also make use of multi-device layouts. Each disk will have its own read pointer and all disks will be utilized in parallel. Also, no two disks will read both sides of a mirror simultaneously, as this would waste seeking capacity. Instead both disks will read different parts of the filesystem. Any number of readaheads can be started in parallel. The read order will be determined globally, i.e. 2 parallel readaheads will normally finish faster than the 2 started one after another. Changes v2: - protect root->node by transaction instead of node_lock - fix missed branches: The readahead had too simple a check to determine if a branch from a node should be checked or not. It now also records the upper bound of each node to see if the requested RA range lies within. - use KERN_CONT for debug output, to avoid line breaks - defer reada_start_machine to worker to avoid deadlock Changes v3: - protect root->node by rcu Changes v5: - changed EIO-semantics of reada_tree_block_flagged - remove spin_lock from reada_control and make elems an atomic_t - remove unused read_total from reada_control - kill reada_key_cmp, use btrfs_comp_cpu_keys instead - use kref-style release functions where possible - return struct reada_control * instead of void * from btrfs_reada_add Signed-off-by: Arne Jansen --- fs/btrfs/Makefile | 3 +- fs/btrfs/ctree.h | 16 + fs/btrfs/reada.c | 949 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 967 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/reada.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac08c21f..bdd6fb238ce1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f71fd24cc152..370af767440d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2702,4 +2702,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct
reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + #endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 000000000000..2b701d082227 --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,949 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. Each disk + * will have its own read pointer and all disks will be utilized in parallel. + * Also, no two disks will read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another.
+ */ + +#define MAX_MIRRORS 2 +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + u32 blocksize; + int err; + struct list_head extctl; + struct kref refcnt; + spinlock_t lock; + struct reada_zone *zones[MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. 
+ * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + printk(KERN_DEBUG "generation mismatch for " + "(%llu,%d,%llu) %llu != %llu\n", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_multi_bio *multi) +{ + int ret; + int looped = 0; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + +again: + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + if (looped) + return NULL; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < multi->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = multi->stripes[i].dev; + } + zone->ndevs = multi->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + zone); + spin_unlock(&fs_info->reada_lock); + + if (ret) { + kfree(zone); + looped = 1; + goto again; + } + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + int looped = 0; + struct reada_extent *re = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_multi_bio *multi = NULL; + struct btrfs_device *dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; 
+ unsigned long index = logical >> PAGE_CACHE_SHIFT; + +again: + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (re || looped) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = btrfs_level_size(root, level); + re->logical = logical; + re->blocksize = blocksize; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + kref_init(&re->refcnt); + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0); + if (ret || !multi || length < blocksize) + goto error; + + if (multi->num_stripes > MAX_MIRRORS) { + printk(KERN_ERR "btrfs readahead: more than %d copies not " + "supported", MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < multi->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = multi->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, multi); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret) { + spin_unlock(&fs_info->reada_lock); + if (ret != -ENOMEM) { + /* someone inserted the extent in the meantime */ + looped = 1; + } + goto error; + } + for (i = 0; i < nzones; ++i) { + dev = multi->stripes[i].dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = multi->stripes[i].dev; + BUG_ON(dev == NULL); + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + kfree(re); + if (looped) + goto again; + return NULL; +} + +static void reada_kref_dummy(struct kref *kr) +{ +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (!kref_put(&re->refcnt, reada_kref_dummy)) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no 
fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + u32 blocksize; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. 
If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + re->blocksize; + kref_get(&re->refcnt); + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + blocksize = re->blocksize; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + __reada_start_machine(fs_info); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. 
+ */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + rmw->work.func = reada_start_machine_worker; + rmw->fs_info = fs_info; + + btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, re->blocksize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, re->blocksize, list_empty(&re->extctl), + re->scheduled_for ? 
re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + reada_add_block(rc, start, &max_key, level, generation); + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} -- cgit v1.2.3-58-ga151 From 4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 13:55:54 +0200 Subject: btrfs: hooks for readahead This adds the hooks needed for readahead. In the readpage_end_io_hook, the extent buffer is checked for the EXTENT_BUFFER_READAHEAD flag. Only in this case is the readahead hook called, to keep the impact on non-readahead reads as low as possible. Additionally, a hook for a failed IO is added; otherwise readahead would wait indefinitely for the extent to finish.
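The pattern both hooks below share can be sketched as follows (taken from the diff; err stands for the status the end_io path computed). The flag is cleared before the callback runs, so each buffer is handed to the readahead machinery at most once:

	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
		btree_readahead_hook(root, eb, eb->start, err);
	}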
Changes for v2: - eliminate race condition Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 37 +++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.c | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fd28c40e9ec3..2151828aa142 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -609,11 +609,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, -EIO); + } + +out: + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -3166,6 +3202,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 823028e73cf8..deba714236d7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1731,7 +1731,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + start, end, state); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -- cgit v1.2.3-58-ga151 From 7a26285eea8eb92e0088db011571d887d4551b0f Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 12:39:23 +0200 Subject: btrfs: use readahead API for scrub Scrub uses a simple tree-enumeration to bring the relevant portions of the extent- and csum-tree into the page cache before starting the scrub-I/O. This is now replaced by using the new readahead-API. During readahead the scrub is accounted as paused, so it won't hold off transaction commits. This change raises the average disk bandwidth utilisation on my test volume from 70% to 90%. On another volume, the time for a test run went down from 89s to 43s.
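In outline, the readahead pattern scrub adopts looks like this (a sketch condensed from the diff below; the key ranges are placeholders and are computed per stripe there):

	struct reada_control *reada1, *reada2;

	reada1 = btrfs_reada_add(root, &key_start, &key_end);	/* extent tree */
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);	/* csum tree; keys refilled with the csum range */
	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);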
Changes v5: - reada1/2 are now of type struct reada_control * Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 112 +++++++++++++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..f930f2776589 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -29,15 +29,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - * extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are * affected and report them * - In case of a read error on files with nodatasum, map the file and read * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; u64 mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; - path->reada = 2; path->search_commit_root = 1; path->skip_locking = 1; /* - * find all extents for each stripe and just read them to get - * them into the page cache - * FIXME: we can do better. build a more intelligent prefetching + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits */ logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_noplug; - - /* - * we might miss half an extent here, but that doesn't matter, - * as it's only the prefetch - */ - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out_noplug; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); - if (key.objectid >= logical + map->stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path->slots[0]++; - } - 
btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - cond_resched(); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during * the scrub. This might currently (crc32) end up to be about 1MB */ - start_stripe = 0; blk_start_plug(&plug); -again: - logical = base + offset + start_stripe * increment; - for (i = start_stripe; i < nstripes; ++i) { - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - logical += increment; - cond_resched(); - } /* * now find all extents for each stripe and scrub them */ - logical = base + offset + start_stripe * increment; - physical = map->stripes[num].physical + start_stripe * map->stripe_len; + logical = base + offset; + physical = map->stripes[num].physical; ret = 0; - for (i = start_stripe; i < nstripes; ++i) { + for (i = 0; i < nstripes; ++i) { /* * canceled? */ @@ -882,11 +868,14 @@ again: atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - scrub_free_csums(sdev); - start_stripe = i; - goto again; } + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)0; @@ -982,7 +971,6 @@ next: out: blk_finish_plug(&plug); -out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } -- cgit v1.2.3-58-ga151 From af6a311384bce6c88e15c80ab22ab051a918b4eb Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 3 Oct 2011 20:46:17 -0600 Subject: writeback: add bg_threshold parameter to __bdi_update_bandwidth() No behavior change. 
Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 1 + mm/page-writeback.c | 11 +++++++---- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 04cf3b91e501..28076562ada0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -675,7 +675,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2b8963ff0f35..ddb4652cb337 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -143,6 +143,7 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, void __bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long thresh, + unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c16ddd8f5cb6..4b954c9fe846 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -779,6 +779,7 @@ static void global_update_bandwidth(unsigned long thresh, void __bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long thresh, + unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, @@ -815,6 +816,7 @@ snapshot: static void bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long thresh, + unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, @@ -823,8 +825,8 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) return; spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, - start_time); + __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty, start_time); spin_unlock(&bdi->wb.list_lock); } @@ -912,8 +914,9 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; - bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, - bdi_thresh, bdi_dirty, start_time); + bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, + nr_dirty, bdi_thresh, bdi_dirty, + start_time); /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked -- cgit v1.2.3-58-ga151 From b00949aa2df9970a912bf060bc95e99da356881c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 18 Nov 2010 14:38:33 -0600 Subject: writeback: per-bdi background threshold One thing that puzzled me is that in the JBOD case, the per-disk writeout performance is smaller than the corresponding single-disk case even when they have comparable bdi_thresh. Tracing finds that in the single disk case, bdi_writeback is always kept high while in the JBOD case, it could drop low from time to time and correspondingly bdi_reclaimable could sometimes rush high. The fix is to watch bdi_reclaimable and kick background writeback as soon as it goes high. This resembles the global background threshold but in a per-bdi manner. The trick is, as long as bdi_reclaimable does not go high, bdi_writeback naturally won't go low because bdi_reclaimable+bdi_writeback ~= bdi_thresh. With less fluctuation in writeback pages, JBOD performance is observed to increase noticeably in various cases.
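To put hypothetical numbers on it: with bdi_thresh at 200 MB, letting bdi_reclaimable rush to 180 MB leaves only ~20 MB of pages actually under writeback by the invariant above, and the disk goes idle; kicking the flusher as soon as bdi_reclaimable crosses the bdi's share of the background threshold is what keeps bdi_writeback high. The shape of the check, as a sketch of what over_bground_thresh() becomes in the diff below:

	static bool over_bground_thresh(struct backing_dev_info *bdi)
	{
		unsigned long background_thresh, dirty_thresh;

		global_dirty_limits(&background_thresh, &dirty_thresh);

		/* global background threshold, as before */
		if (global_page_state(NR_FILE_DIRTY) +
		    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
			return true;

		/* new: per-bdi background threshold */
		return bdi_stat(bdi, BDI_RECLAIMABLE) >
		       bdi_dirty_limit(bdi, background_thresh);
	}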
vmstat:nr_written values before/after patch:

  3.1.0-rc4-wo-underrun+    3.1.0-rc4-bgthresh3+
------------------------  ------------------------
 125596480   +25.9%   158179363  JBOD-10HDD-16G/ext4-100dd-1M-24p-16384M-20:10-X
  61790815  +110.4%   130032231  JBOD-10HDD-16G/ext4-10dd-1M-24p-16384M-20:10-X
  58853546    -0.1%    58823828  JBOD-10HDD-16G/ext4-1dd-1M-24p-16384M-20:10-X
 110159811   +24.7%   137355377  JBOD-10HDD-16G/xfs-100dd-1M-24p-16384M-20:10-X
  69544762   +10.8%    77080047  JBOD-10HDD-16G/xfs-10dd-1M-24p-16384M-20:10-X
  50644862    +0.5%    50890006  JBOD-10HDD-16G/xfs-1dd-1M-24p-16384M-20:10-X
  42677090   +28.0%    54643527  JBOD-10HDD-thresh=100M/ext4-100dd-1M-24p-16384M-100M:10-X
  47491324   +13.3%    53785605  JBOD-10HDD-thresh=100M/ext4-10dd-1M-24p-16384M-100M:10-X
  52548986    +0.9%    53001031  JBOD-10HDD-thresh=100M/ext4-1dd-1M-24p-16384M-100M:10-X
  26783091   +36.8%    36650248  JBOD-10HDD-thresh=100M/xfs-100dd-1M-24p-16384M-100M:10-X
  35526347   +14.0%    40492312  JBOD-10HDD-thresh=100M/xfs-10dd-1M-24p-16384M-100M:10-X
  44670723    -1.1%    44177606  JBOD-10HDD-thresh=100M/xfs-1dd-1M-24p-16384M-100M:10-X
 127996037   +22.4%   156719990  JBOD-10HDD-thresh=2G/ext4-100dd-1M-24p-16384M-2048M:10-X
  57518856    +3.8%    59677625  JBOD-10HDD-thresh=2G/ext4-10dd-1M-24p-16384M-2048M:10-X
  51919909   +12.2%    58269894  JBOD-10HDD-thresh=2G/ext4-1dd-1M-24p-16384M-2048M:10-X
  86410514   +79.0%   154660433  JBOD-10HDD-thresh=2G/xfs-100dd-1M-24p-16384M-2048M:10-X
  40132519   +38.6%    55617893  JBOD-10HDD-thresh=2G/xfs-10dd-1M-24p-16384M-2048M:10-X
  48423248    +7.5%    52042927  JBOD-10HDD-thresh=2G/xfs-1dd-1M-24p-16384M-2048M:10-X
 206041046   +44.1%   296846536  JBOD-10HDD-thresh=4G/xfs-100dd-1M-24p-16384M-4096M:10-X
  72312903   -19.4%    58272885  JBOD-10HDD-thresh=4G/xfs-10dd-1M-24p-16384M-4096M:10-X
  50635672    -0.5%    50384787  JBOD-10HDD-thresh=4G/xfs-1dd-1M-24p-16384M-4096M:10-X
  68308534  +115.7%   147324758  JBOD-10HDD-thresh=800M/ext4-100dd-1M-24p-16384M-800M:10-X
  57882933   +14.5%    66269621  JBOD-10HDD-thresh=800M/ext4-10dd-1M-24p-16384M-800M:10-X
  52183472   +12.8%    58855181  JBOD-10HDD-thresh=800M/ext4-1dd-1M-24p-16384M-800M:10-X
  53788956   +94.2%   104460352  JBOD-10HDD-thresh=800M/xfs-100dd-1M-24p-16384M-800M:10-X
  44493342   +35.5%    60298210  JBOD-10HDD-thresh=800M/xfs-10dd-1M-24p-16384M-800M:10-X
  42641209   +18.9%    50681038  JBOD-10HDD-thresh=800M/xfs-1dd-1M-24p-16384M-800M:10-X

Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 28076562ada0..6401cd76f109 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -658,14 +658,21 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) return nr_pages - work.nr_pages; } -static inline bool over_bground_thresh(void) +static bool over_bground_thresh(struct backing_dev_info *bdi) { unsigned long background_thresh, dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (bdi_stat(bdi, BDI_RECLAIMABLE) > + bdi_dirty_limit(bdi, background_thresh)) + return true; + + return false; } /* @@ -727,7 +734,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(wb->bdi)) break; if (work->for_kupdate) { @@ -811,7 +818,7
@@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { + if (over_bground_thresh(wb->bdi)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, -- cgit v1.2.3-58-ga151 From 1958c7c284298f0d17a1ab3820c77c40ea3d3711 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:42:43 -0700 Subject: exofs/ore.c: local functions should be static This quiets the sparse noise: warning: symbol '_calc_trunk_info' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af88198..6114fdffcad8 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -753,8 +753,8 @@ struct _trunc_info { unsigned max_devs; }; -void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) { unsigned stripe_unit = layout->stripe_unit; -- cgit v1.2.3-58-ga151 From de74b05ace743b4a7aefad9e9b33ff899979b34a Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:46:51 -0700 Subject: exofs/super.c: local functions should be static This quiets the following sparse noise: warning: symbol 'exofs_sync_fs' was not declared. Should it be static? warning: symbol 'exofs_free_sbi' was not declared. Should it be static? warning: symbol 'exofs_get_parent' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Signed-off-by: Boaz Harrosh --- fs/exofs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b02..afe79ee088ad 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -355,7 +355,7 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; @@ -429,7 +429,7 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { while (sbi->comps.numdevs) { int i = --sbi->comps.numdevs; @@ -981,7 +981,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); -- cgit v1.2.3-58-ga151 From 5bf696dad4beecb6174e701c97e1f2574e6a2c96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:39:59 +0300 Subject: exofs: Rename struct ore_components comps => oc ore_components already has a comps member so this leads to things like comps->comps which is annoying. the name oc was already used in new code. So rename all old usage of ore_components comps => ore_components oc. 
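A one-line before/after from ore.c makes the point (taken from the _ios_cred() accessor in the diff below):

	/* before: two different "comps" in one expression */
	return ios->comps->comps[index & ios->comps->single_comp].cred;
	/* after */
	return ios->oc->comps[index & ios->oc->single_comp].cred;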
Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 16 +++++++------- fs/exofs/inode.c | 22 +++++++++---------- fs/exofs/ore.c | 30 +++++++++++++------------- fs/exofs/super.c | 58 +++++++++++++++++++++++++------------------------- include/scsi/osd_ore.h | 2 +- 5 files changed, 64 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442ec7445..c09d5a765efe 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -71,7 +71,7 @@ struct exofs_sb_info { */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components comps; /* comps for the partition */ + struct ore_components oc; /* comps for the partition */ struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; @@ -86,7 +86,7 @@ struct exofs_i_info { uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ struct ore_comp one_comp; /* same component for all devices */ - struct ore_components comps; /* inode view of the device table */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; * bigger and that the device table repeats twice. * See: exofs_read_lookup_dev_table() */ -static inline void exofs_init_comps(struct ore_components *comps, +static inline void exofs_init_comps(struct ore_components *oc, struct ore_comp *one_comp, struct exofs_sb_info *sbi, osd_id oid) { @@ -217,13 +217,13 @@ static inline void exofs_init_comps(struct ore_components *comps, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - comps->numdevs = sbi->comps.numdevs; - comps->single_comp = EC_SINGLE_COMP; - comps->comps = one_comp; + oc->numdevs = sbi->oc.numdevs; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; - comps->ods = sbi->comps.ods + first_dev; + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = sbi->oc.ods + first_dev; } #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38fc2349..61b2f7e5cdbd 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -270,7 +270,7 @@ static int read_exec(struct page_collect *pcol) return 0; if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -516,7 +516,7 @@ static int write_exec(struct page_collect *pcol) return 0; BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -860,7 +860,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -927,14 +927,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_on_disk_inode_layout *layout; int ret; - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { 
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -1018,7 +1018,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); /* read the inode from the osd */ @@ -1172,13 +1172,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); @@ -1267,7 +1267,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; @@ -1350,7 +1350,7 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 6114fdffcad8..870f85a232d1 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -49,20 +49,20 @@ MODULE_LICENSE("GPL"); static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { - return ios->comps->comps[index & ios->comps->single_comp].cred; + return ios->oc->comps[index & ios->oc->single_comp].cred; } static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) { - return &ios->comps->comps[index & ios->comps->single_comp].obj; + return &ios->oc->comps[index & ios->oc->single_comp].obj; } static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->comps->ods[index]; + return ios->oc->ods[index]; } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, bool is_reading, u64 offset, u64 length, struct ore_io_state **pios) { @@ -71,16 +71,16 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); if (unlikely(!ios)) { ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(comps->numdevs)); + ore_io_state_size(oc->numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; - ios->comps = comps; + ios->oc = oc; ios->offset = offset; ios->length = length; ios->reading = is_reading; @@ 
-90,10 +90,10 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, } EXPORT_SYMBOL(ore_get_rw_state); -int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, struct ore_io_state **ios) { - return ore_get_rw_state(layout, comps, true, 0, 0, ios); + return ore_get_rw_state(layout, oc, true, 0, 0, ios); } EXPORT_SYMBOL(ore_get_io_state); @@ -476,7 +476,7 @@ int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -501,7 +501,7 @@ int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -768,7 +768,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, ti->max_devs = layout->group_width * layout->group_count; } -int ore_truncate(struct ore_layout *layout, struct ore_components *comps, +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, u64 size) { struct ore_io_state *ios; @@ -779,7 +779,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, struct _trunc_info ti; int i, ret; - ret = ore_get_io_state(layout, comps, &ios); + ret = ore_get_io_state(layout, oc, &ios); if (unlikely(ret)) return ret; @@ -792,7 +792,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, goto out; } - ios->numdevs = ios->comps->numdevs; + ios->numdevs = ios->oc->numdevs; for (i = 0; i < ti.max_devs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; @@ -815,7 +815,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, size_attr->attr.val_ptr = &size_attr->newsize; ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(comps->comps->obj.id), _LLU(obj_size), i); + _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) diff --git a/fs/exofs/super.c b/fs/exofs/super.c index afe79ee088ad..babb1958f6f0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -360,7 +360,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct ore_comp one_comp; - struct ore_components comps; + struct ore_components oc; struct ore_io_state *ios; int ret = -ENOMEM; @@ -378,9 +378,9 @@ static int exofs_sync_fs(struct super_block *sb, int wait) * the writeable info is set in exofs_sbi_write_stats() above. 
*/ - exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - ret = ore_get_io_state(&sbi->layout, &comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -431,17 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path, static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->comps.numdevs) { - int i = --sbi->comps.numdevs; - struct osd_dev *od = sbi->comps.ods[i]; + while (sbi->oc.numdevs) { + int i = --sbi->oc.numdevs; + struct osd_dev *od = sbi->oc.ods[i]; if (od) { - sbi->comps.ods[i] = NULL; + sbi->oc.ods[i] = NULL; osduld_put_device(od); } } - if (sbi->comps.ods != sbi->_min_one_dev) - kfree(sbi->comps.ods); + if (sbi->oc.ods != sbi->_min_one_dev) + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +468,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0], sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -623,7 +623,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, return -ENOMEM; } - sbi->comps.numdevs = 0; + sbi->oc.numdevs = 0; comp.obj.partition = sbi->one_comp.obj.partition; comp.obj.id = EXOFS_DEVTABLE_ID; @@ -648,13 +648,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->comps.ods[0]); + unsigned size = numdevs * sizeof(sbi->oc.ods[0]); /* Twice bigger table: See exofs_init_comps() and below * comment */ - sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->comps.ods)) { + sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL); + if (unlikely(!sbi->oc.ods)) { EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", numdevs); ret = -ENOMEM; @@ -681,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->comps.ods[i] = fscb_od; - ++sbi->comps.numdevs; + sbi->oc.ods[i] = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -695,8 +695,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->comps.ods[i] = od; - ++sbi->comps.numdevs; + sbi->oc.ods[i] = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -719,7 +719,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); if (likely(!ret)) { - unsigned numdevs = sbi->comps.numdevs; + unsigned numdevs = sbi->oc.numdevs; if (unlikely(fscb_od)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); @@ -732,7 +732,7 @@ out: * starting at this device. 
See exofs_init_comps() */ for (i = 0; i < numdevs - 1; ++i) - sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; + sbi->oc.ods[i + numdevs] = sbi->oc.ods[i]; } return ret; } @@ -783,10 +783,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->comps.numdevs = 1; - sbi->comps.single_comp = EC_SINGLE_COMP; - sbi->comps.comps = &sbi->one_comp; - sbi->comps.ods = sbi->_min_one_dev; + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; + sbi->oc.ods = sbi->_min_one_dev; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -835,7 +835,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->comps.ods[0] = od; + sbi->oc.ods[0] = od; } __sbi_read_stats(sbi); @@ -875,7 +875,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0], sbi->one_comp.obj.partition); return 0; @@ -924,7 +924,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint64_t used = ULLONG_MAX; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index c5c5e008e6de..954292a23767 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -64,7 +64,7 @@ struct ore_io_state { ore_io_done_fn done; struct ore_layout *layout; - struct ore_components *comps; + struct ore_components *oc; /* Global read/write IO*/ loff_t offset; -- cgit v1.2.3-58-ga151 From 8d2d83a8352b0f9c1da82c36f741722f2960feea Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:15:02 -0700 Subject: exofs: Remove unused data_map member from exofs_sb_info The struct pnfs_osd_data_map data_map member of exofs_sb_info was never used after mount. In fact all it's members were duplicated by the ore_layout structure. So just remove the duplicated information. Also removed some stupid, but perfectly supported, restrictions on layout parameters. The case where num_devices is not divisible by mirror_count+1 is perfectly fine since the rotating device view will eventually use all the devices it can get. Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- fs/exofs/exofs.h | 3 --- fs/exofs/super.c | 57 +++++++++++++++++++------------------------------- include/scsi/osd_ore.h | 2 ++ 3 files changed, 24 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c09d5a765efe..3b2e0478f363 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -66,9 +66,6 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? 
- */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index babb1958f6f0..90b4c526939f 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -481,64 +481,51 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, { u64 stripe_length; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); /* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) { EXOFS_ERR("Only RAID_0 for now\n"); return -EINVAL; } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); + if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) { + EXOFS_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + numdevs, sbi->layout.group_width, + sbi->layout.mirrors_p1); return -EINVAL; } - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) { EXOFS_ERR("Stripe Unit(0x%llx)" " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + _LLU(sbi->layout.stripe_unit), PAGE_SIZE); return -EINVAL; } - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (sbi->layout.group_width) { if (!sbi->layout.group_depth) { EXOFS_ERR("group_depth == 0 && group_width != 0\n"); return -EINVAL; } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; + sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 / + sbi->layout.group_width; } else { - if (sbi->data_map.odm_group_depth) { + if (sbi->layout.group_depth) { printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; + "group_width == 0 && group_depth == %lld\n", + _LLU(sbi->layout.group_depth)); } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; + sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1; sbi->layout.group_depth = -1; sbi->layout.group_count = 1; } @@ -558,7 +545,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.group_width, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, - 
sbi->data_map.odm_raid_algorithm); + sbi->layout.raid_algorithm); return 0; } diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 954292a23767..f7fabb478877 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -34,6 +34,8 @@ struct ore_comp { struct ore_layout { /* Our way of looking at the data_map */ + enum pnfs_osd_raid_algorithm4 + raid_algorithm; unsigned stripe_unit; unsigned mirrors_p1; -- cgit v1.2.3-58-ga151 From eb507bc18969f63b8968034144fd69706c492516 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:17:28 -0700 Subject: ore: Make ore_striping_info and ore_calc_stripe_info public The struct ore_striping_info will be used later in other structures. And ore_calc_stripe_info as well. Rename them make struct ore_striping_info public. ore_calc_stripe_info is still static, will be made public on first use. Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 24 ++++++++---------------- include/scsi/osd_ore.h | 8 ++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 870f85a232d1..c2b0033a724b 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -287,16 +287,8 @@ EXPORT_SYMBOL(ore_check_io); * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - u64 M; /* for truncate */ - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct _striping_info *si) +static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; @@ -375,7 +367,7 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, } static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct _striping_info *si) + struct ore_striping_info *si) { unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; @@ -434,14 +426,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) { u64 length = ios->length; u64 offset = ios->offset; - struct _striping_info si; + struct ore_striping_info si; int ret = 0; if (!ios->pages) { if (ios->kern_buff) { struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - _calc_stripe_info(ios->layout, ios->offset, &si); + ore_calc_stripe_info(ios->layout, ios->offset, &si); per_dev->offset = si.obj_offset; per_dev->dev = si.dev; @@ -455,7 +447,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) } while (length) { - _calc_stripe_info(ios->layout, offset, &si); + ore_calc_stripe_info(ios->layout, offset, &si); if (length < si.group_length) si.group_length = length; @@ -744,7 +736,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, } struct _trunc_info { - struct _striping_info si; + struct ore_striping_info si; u64 prev_group_obj_off; u64 next_group_obj_off; @@ -758,7 +750,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, { unsigned stripe_unit = layout->stripe_unit; - _calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? 
(ti->si.M - 1) * stripe_unit : 0; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index f7fabb478877..e4d550faa7c9 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -56,6 +56,14 @@ struct ore_components { struct osd_dev **ods; /* osd_dev array */ }; +struct ore_striping_info { + u64 obj_offset; + u64 group_length; + u64 M; /* for truncate */ + unsigned dev; + unsigned unit_off; +}; + struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); -- cgit v1.2.3-58-ga151 From d866d875f68fdeae63df334d291fe138dc636d96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 14:43:09 +0300 Subject: ore/exofs: Change the type of the devices array (API change) In the pNFS obj-LD the device table at the layout level needs to point to a device_cache node, where it is possible and likely that many layouts will point to the same device-nodes. In Exofs we have a more orderly structure where we have a single array of devices that repeats twice for a round-robin view of the device table This patch moves to a model that can be used by the pNFS obj-LD where struct ore_components holds an array of ore_dev-pointers. (ore_dev is newly defined and contains a struct osd_dev *od member) Each pointer in the array of pointers will point to a bigger user-defined dev_struct. That can be accessed by use of the container_of macro. In Exofs an __alloc_dev_table() function allocates the ore_dev-pointers array as well as an exofs_dev array, in one allocation and does the addresses dance to set everything pointing correctly. It still keeps the double allocation trick for the inodes round-robin view of the table. The device table is always allocated dynamically, also for the single device case. So it is unconditionally freed at umount. 
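The access pattern this buys is the usual embedded-struct/container_of one; a minimal sketch using the exofs names from the patch below:

	struct ore_dev {
		struct osd_dev *od;
	};

	struct exofs_dev {
		struct ore_dev ored;	/* embedded; oc->ods[i] points at this */
		unsigned did;
	};

	static inline struct exofs_dev *to_exofs_dev(struct ore_dev *ored)
	{
		/* recover the containing exofs_dev from the generic pointer */
		return container_of(ored, struct exofs_dev, ored);
	}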
Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 10 +++-- fs/exofs/ore.c | 2 +- fs/exofs/super.c | 99 +++++++++++++++++++++++++++++++------------------- include/scsi/osd_ore.h | 26 ++++++++++++- 4 files changed, 94 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 3b2e0478f363..006fd6f33571 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -53,6 +53,10 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_dev { + struct ore_dev ored; + unsigned did; +}; /* * our extension to the in-memory superblock */ @@ -69,7 +73,6 @@ struct exofs_sb_info { struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; /* @@ -214,13 +217,14 @@ static inline void exofs_init_comps(struct ore_components *oc, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - oc->numdevs = sbi->oc.numdevs; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; oc->single_comp = EC_SINGLE_COMP; oc->comps = one_comp; /* Round robin device view of the table */ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; - oc->ods = sbi->oc.ods + first_dev; + oc->ods = &sbi->oc.ods[first_dev]; } #endif diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index c2b0033a724b..a7d79257fc65 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -59,7 +59,7 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->oc->ods[index]; + return ore_comp_dev(ios->oc, index); } int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 90b4c526939f..bce3686f0aa0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -431,17 +431,18 @@ static void _exofs_print_device(const char *msg, const char *dev_path, static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->oc.numdevs) { - int i = --sbi->oc.numdevs; - struct osd_dev *od = sbi->oc.ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->oc.ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } - if (sbi->oc.ods != sbi->_min_one_dev) - kfree(sbi->oc.ods); + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0], + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -592,12 +593,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + 
numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, struct osd_dev *fscb_od, unsigned table_count) { struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -634,20 +663,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->oc.ods[0]); - - /* Twice bigger table: See exofs_init_comps() and below - * comment - */ - sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->oc.ods)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", - numdevs); - ret = -ENOMEM; - goto out; - } - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -663,12 +688,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->oc.ods[i] = fscb_od; + eds[i].ored.od = fscb_od; ++sbi->oc.numdevs; fscb_od = NULL; continue; @@ -682,7 +710,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->oc.ods[i] = od; + eds[i].ored.od = od; ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS @@ -705,21 +733,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); - if (likely(!ret)) { - unsigned numdevs = sbi->oc.numdevs; - - if (unlikely(fscb_od)) { + if (unlikely(fscb_od && !ret)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); osduld_put_device(fscb_od); return -EINVAL; - } - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. 
See exofs_init_comps() - */ - for (i = 0; i < numdevs - 1; ++i) - sbi->oc.ods[i + numdevs] = sbi->oc.ods[i]; } return ret; } @@ -773,7 +790,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->oc.numdevs = 1; sbi->oc.single_comp = EC_SINGLE_COMP; sbi->oc.comps = &sbi->one_comp; - sbi->oc.ods = sbi->_min_one_dev; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -822,7 +838,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->oc.ods[0] = od; + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -862,7 +884,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0], + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); return 0; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index e4d550faa7c9..8fefdfbb1ced 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -44,6 +44,10 @@ struct ore_layout { unsigned group_count; }; +struct ore_dev { + struct osd_dev *od; +}; + struct ore_components { unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single @@ -53,9 +57,29 @@ struct ore_components { EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff } single_comp; struct ore_comp *comps; - struct osd_dev **ods; /* osd_dev array */ + + /* Array of pointers to ore_dev-* . User will usually have these pointed + * too a bigger struct which contain an "ore_dev ored" member and use + * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger + * structure. + */ + struct ore_dev **ods; }; +/* ore_comp_dev Recievies a logical device index */ +static inline struct osd_dev *ore_comp_dev( + const struct ore_components *oc, unsigned i) +{ + BUG_ON(oc->numdevs <= i); + return oc->ods[i]->od; +} + +static inline void ore_comp_set_dev( + struct ore_components *oc, unsigned i, struct osd_dev *od) +{ + oc->ods[i]->od = od; +} + struct ore_striping_info { u64 obj_offset; u64 group_length; -- cgit v1.2.3-58-ga151 From 3ee77f2091c229a72c64d9aeef1ee4a7624e5795 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:55:52 -0700 Subject: ext3/balloc.c: local functions should be static This quites the sparse noise: warning: symbol 'ext3_trim_all_free' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index caedc00520ed..a7fceaba87f1 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) * reaches any used block. Then issue a TRIM command on this extent and free * the extent in the block bitmap. This is done until whole group is scanned. 
*/ -ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) { handle_t *handle; ext3_grpblk_t next, free_blocks, bit, freed, count = 0; -- cgit v1.2.3-58-ga151 From bc1123239ab950798779ca4e23228afb5443eb5d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 3 Oct 2011 14:02:59 +0900 Subject: udf: Add readpages support for udf. Use mpage_readpages() instead of multiple calls to udf_readpage() to reduce CPU utilization and improve performance. Signed-off-by: Namjae Jeon Signed-off-by: Jan Kara --- fs/udf/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358ed80c1..f94d6f9febf5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "udf_i.h" #include "udf_sb.h" @@ -104,7 +105,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc) static int udf_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, udf_get_block); + return mpage_readpage(page, udf_get_block); +} + +static int udf_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -139,6 +146,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, + .readpages = udf_readpages, .writepage = udf_writepage, .write_begin = udf_write_begin, .write_end = generic_write_end, -- cgit v1.2.3-58-ga151 From 7aa0baeaba4afc4fbed7aad2812a1116e6b0adcd Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 6 Oct 2011 10:22:28 -0400 Subject: ext4: Free resources in ext4_mb_init()'s error paths In commit 79a77c5ac, we moved ext4_mb_init_backend after the allocation of s_locality_group to avoid a memory leak in the error path, but there are still some other error paths in ext4_mb_init that need to do the same work. So this patch adds handling for all the error paths in ext4_mb_init. All the pointers are also reset to NULL, so a later double free by the caller is harmless.
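The shape the patch moves ext4_mb_init() to is the usual goto ladder, unwinding in reverse order of setup; a generic sketch of the idiom with illustrative names:

	struct ctx { void *a, *b, *c; };

	static int setup_a(struct ctx *c) { return 0; /* ... */ }
	static int setup_b(struct ctx *c) { return 0; /* ... */ }
	static int setup_c(struct ctx *c) { return 0; /* ... */ }
	static void teardown_a(struct ctx *c) { /* ... */ }
	static void teardown_b(struct ctx *c) { /* ... */ }

	int setup_all(struct ctx *c)
	{
		int ret;

		ret = setup_a(c);
		if (ret)
			goto out;
		ret = setup_b(c);
		if (ret)
			goto out_free_a;
		ret = setup_c(c);
		if (ret)
			goto out_free_b;
		return 0;

	out_free_b:		/* unwind in reverse order of setup */
		teardown_b(c);
	out_free_a:
		teardown_a(c);
	out:
		return ret;
	}

Resetting each freed pointer to NULL, as the patch also does, makes any later kfree() of the same pointer by the caller a harmless no-op, since kfree(NULL) is defined to do nothing.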
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1c83161090f8..8c005c028203 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2504,7 +2504,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_groupinfo_slab; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2517,9 +2517,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) /* init file for buddy data */ ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } + if (ret != 0) + goto out_free_locality_groups; if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, @@ -2527,11 +2526,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out_free_groupinfo_slab: + ext4_groupinfo_destroy_slabs(); out: - if (ret) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - } + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; return ret; } -- cgit v1.2.3-58-ga151 From dcf2d804ed6ffe5e942b909ed5e5b74628be6ee4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 6 Oct 2011 12:10:11 -0400 Subject: ext4: Free resources in some error path in ext4_fill_super Some of the error paths in ext4_fill_super don't release their resources properly, so this patch releases them in the right way. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 07f3de341452..db2cd3f21074 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3785,22 +3785,19 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); - goto failed_mount4; + goto failed_mount5; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) - goto failed_mount4; + goto failed_mount6; sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, "%s", sb->s_id); - if (err) { - ext4_mb_release(sb); - ext4_ext_release(sb); - goto failed_mount4; - }; + if (err) + goto failed_mount7; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -3834,13 +3831,19 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_ext_release(sb); +failed_mount5: + ext4_mb_release(sb); + ext4_release_system_zone(sb); failed_mount4: iput(root); sb->s_root = NULL; ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: - ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; -- cgit v1.2.3-58-ga151 From 4113c4caa4f355b8ff8b7ff0510c29c9d00d30b3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 8 Oct 2011 14:34:47 -0400 Subject: ext4: remove deprecated oldalloc For a long time now orlov has been the default block allocator in ext4.
It performs better than the old one and no one seems to claim otherwise so we can safely drop it and make oldalloc and orlov mount option deprecated. This is a part of the effort to reduce number of ext4 options hence the test matrix. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 8 --- fs/ext4/ext4.h | 1 - fs/ext4/ialloc.c | 138 +------------------------------------ fs/ext4/super.c | 8 +-- 4 files changed, 7 insertions(+), 148 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index a53040193967..4917cf24a5e0 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -203,14 +203,6 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -orlov (*) This enables the new Orlov block allocator. It is - enabled by default. - -oldalloc This disables the Orlov block allocator and enables - the old block allocator. Orlov should have better - performance - we'd like to get some feedback if it's - the contrary for you. - nouser_xattr Disables Extended User Attributes. If you have extended attribute support enabled in the kernel configuration (CONFIG_EXT4_FS_XATTR), extended attribute support diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1bbd2caebe7f..61b0b5cd4aa2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -913,7 +913,6 @@ struct ext4_inode_info { /* * Mount flags */ -#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d50a7d5e4726..04219988e75f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -293,120 +293,6 @@ error_return: ext4_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. 
- */ -static int find_group_dir(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned int freei, avefreei; - struct ext4_group_desc *desc, *best_desc = NULL; - ext4_group_t group; - int ret = -1; - - freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) - continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) - continue; - if (!best_desc || - (ext4_free_group_clusters(sb, desc) > - ext4_free_group_clusters(sb, best_desc))) { - *best_group = group; - best_desc = desc; - ret = 0; - } - } - return ret; -} - -#define free_block_ratio 10 - -static int find_group_flex(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *desc; - struct flex_groups *flex_group = sbi->s_flex_groups; - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - int flex_size = ext4_flex_bg_size(sbi); - ext4_group_t best_flex = parent_fbg_group; - int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_clusters; - int flex_freeb_ratio; - ext4_group_t n_fbg_groups; - ext4_group_t i; - - n_fbg_groups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - -find_close_to_parent: - flexbg_free_clusters = atomic_read(&flex_group[best_flex].free_clusters); - flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / - blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && - flex_freeb_ratio > free_block_ratio) - goto found_flexbg; - - if (best_flex && best_flex == parent_fbg_group) { - best_flex--; - goto find_close_to_parent; - } - - for (i = 0; i < n_fbg_groups; i++) { - if (i == parent_fbg_group || i == parent_fbg_group - 1) - continue; - - flexbg_free_clusters = atomic_read(&flex_group[i].free_clusters); - flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / - blocks_per_flex; - - if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { - best_flex = i; - goto found_flexbg; - } - - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_clusters) > - atomic_read(&flex_group[best_flex].free_clusters)) && - atomic_read(&flex_group[i].free_inodes))) - best_flex = i; - } - - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_clusters)) - return -1; - -found_flexbg: - for (i = best_flex * flex_size; i < ngroups && - i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (ext4_free_inodes_count(sb, desc)) { - *best_group = i; - goto out; - } - } - - return -1; -out: - return 0; -} - struct orlov_stats { __u32 free_inodes; __u32 free_clusters; @@ -819,7 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -845,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, goto got_group; } - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { - ret2 = find_group_flex(sb, dir, &group); - if (ret2 == -1) { 
- ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) { - once = 0; - printk(KERN_NOTICE "ext4: find_group_flex " - "failed, fallback succeeded dir %lu\n", - dir->i_ino); - } - } - goto got_group; - } - - if (S_ISDIR(mode)) { - if (test_opt(sb, OLDALLOC)) - ret2 = find_group_dir(sb, dir, &group); - else - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); - } else + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else ret2 = find_group_other(sb, dir, &group, mode); got_group: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db2cd3f21074..42f76c64df8c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1073,8 +1073,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT4_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1583,10 +1581,12 @@ static int parse_options(char *options, struct super_block *sb, set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: -- cgit v1.2.3-58-ga151 From df3ab17072c31fbd394614711772682f0a956a2c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 8 Oct 2011 15:53:49 -0400 Subject: ext4: fix the comment describing ext4_ext_search_right() The comment describing what ext4_ext_search_right() does is incorrect. We return 0 in *phys when *logical is the 'largest' allocated block, not smallest. Fix a few other typos while we're at it. Cc: "Theodore Ts'o" Signed-off-by: Tao Ma --- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ad39627c1fbc..142e3443476f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1263,7 +1263,7 @@ static int ext4_ext_search_left(struct inode *inode, /* * search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function + * if *logical is the largest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ @@ -2168,7 +2168,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, * need to account for leaf block credit * * bitmaps and block group descriptor blocks - * and other metadat blocks still need to be + * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1380cd29c312..4863238c3754 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -458,7 +458,7 @@ static void set_buffers_da_mapped(struct inode *inode, * the buffer head is mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that casem, buffer head is unmapped + * that case, buffer head is unmapped * * It returns the error in case of allocation failure. 
*/ @@ -497,7 +497,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns th create = 0 + * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) -- cgit v1.2.3-58-ga151 From 7fd59c83b05dc1b8af2be4d991ee376f782cd8b0 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 8 Oct 2011 15:56:35 -0400 Subject: ext4: remove the obsolete/broken EXT4_IOC_WAIT_FOR_READONLY ioctl There are no users of the EXT4_IOC_WAIT_FOR_READONLY ioctl, and it is also broken. No one sets the set_ro_timer, no one wakes up us and our state is set to TASK_INTERRUPTIBLE not RUNNING. So remove it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ---------- fs/ext4/ioctl.c | 28 ---------------------------- 2 files changed, 38 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61b0b5cd4aa2..4546da4f26c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -564,9 +564,6 @@ struct ext4_new_group_data { #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -#endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) @@ -589,9 +586,6 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -#endif #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif @@ -1171,10 +1165,6 @@ struct ext4_sb_info { u32 s_max_batch_time; u32 s_min_batch_time; struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2046d699b4a7..8f7ea6905421 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -174,29 +174,6 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. 
- */ - { - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; int err, err2=0; @@ -421,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC32_WAIT_FOR_READONLY: - cmd = EXT4_IOC_WAIT_FOR_READONLY; - break; -#endif case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; -- cgit v1.2.3-58-ga151 From 6ee3b2122431fc75015c6114d4749de76422452b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 8 Oct 2011 16:08:34 -0400 Subject: ext4: use le32_to_cpu for ext4_extent_idx.ei_block in ext4_ext_search_left() ext4_extent_idx.e_block is __le32, so use le32_to_cpu() in ext4_ext_search_left(). Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 142e3443476f..f473ddf0bd94 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1238,9 +1238,9 @@ static int ext4_ext_search_left(struct inode *inode, if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", - ix != NULL ? ix->ei_block : 0, + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); return -EIO; } -- cgit v1.2.3-58-ga151 From d44651d0f922b7aaeddd9fc04f2f5700a65983dd Mon Sep 17 00:00:00 2001 From: Fabrice Jouhaud Date: Sat, 8 Oct 2011 16:26:03 -0400 Subject: ext4: fix ext4 so it works without CONFIG_PROC_FS This fixes a bug which was introduced in dd68314ccf3fb. The problem came from the test of the return value of proc_mkdir which is always false without procfs, and this would initialization of ext4. 
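In outline, the fix treats procfs as an optional facility: record whatever proc_mkdir() returns, never fail module init on a NULL handle, and guard the matching teardown on the saved pointer. A minimal userspace sketch of that pattern follows; the proc_* stubs are stand-ins for illustration, not the kernel API.

/*
 * Sketch only: an optional facility may legitimately fail to set up,
 * so its handle is recorded, failure is non-fatal, and teardown is
 * guarded on the handle.
 */
#include <stdio.h>
#include <stdlib.h>

struct proc_dir { const char *name; };

/* Stand-in: returns NULL when the facility is compiled out. */
static struct proc_dir *proc_mkdir_stub(const char *name)
{
#ifdef NO_PROCFS
	return NULL;
#else
	struct proc_dir *d = malloc(sizeof(*d));
	if (d)
		d->name = name;
	return d;
#endif
}

static void proc_remove_stub(struct proc_dir *d)
{
	free(d);
}

static struct proc_dir *proc_root;

static int init_fs(void)
{
	/* Optional: do not fail the whole init if this returns NULL. */
	proc_root = proc_mkdir_stub("fs/example");
	return 0;
}

static void exit_fs(void)
{
	/* Teardown mirrors init: only remove what was created. */
	if (proc_root)
		proc_remove_stub(proc_root);
}

int main(void)
{
	if (init_fs() == 0)
		printf("init ok, proc root %s\n",
		       proc_root ? "present" : "absent");
	exit_fs();
	return 0;
}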
Signed-off-by: Fabrice Jouhaud Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 42f76c64df8c..dcc460537bc7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3540,10 +3540,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } -#ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); -#endif bgl_lock_init(sbi->s_blockgroup_lock); @@ -5070,13 +5068,11 @@ static int __init ext4_init_fs(void) return err; err = ext4_init_system_zone(); if (err) - goto out7; + goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - goto out6; - ext4_proc_root = proc_mkdir("fs/ext4", NULL); - if (!ext4_proc_root) goto out5; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); if (err) @@ -5112,12 +5108,12 @@ out2: out3: ext4_exit_feat_adverts(); out4: - remove_proc_entry("fs/ext4", NULL); -out5: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); -out6: +out5: ext4_exit_system_zone(); -out7: +out6: ext4_exit_pageio(); return err; } -- cgit v1.2.3-58-ga151 From 40bfa16dac2adcded9e2eda58246cc3700d97de4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 7 Oct 2011 23:21:18 +0800 Subject: ext3: Remove the obsolete broken EXT3_IOC32_WAIT_FOR_READONLY. There are no user of EXT3_IOC32_WAIT_FOR_READONLY and also it is broken. No one set the set_ro_timer, no one wake up us and our state is set to TASK_INTERRUPTIBLE not RUNNING. So remove it. Cc: Jan Kara Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/ext3/ioctl.c | 24 ------------------------ include/linux/ext3_fs_sb.h | 4 ---- 2 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index c7f43944f160..ba1b54e23cae 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -150,30 +150,6 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. 
- */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index 258088ab3c6b..64365252f1b0 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -76,10 +76,6 @@ struct ext3_sb_info { struct mutex s_resize_lock; unsigned long s_commit_interval; struct block_device *journal_bdev; -#ifdef CONFIG_JBD_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ -- cgit v1.2.3-58-ga151 From 7e273e3b41e32716dc122b293b5f15635af495ff Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:02 -0700 Subject: udf: Promote some debugging messages to udf_error If there is a problem with a scratched disc or loader, it's valuable to know which error occurred. Convert some debug messages to udf_error, neaten those messages too. Add the calculated tag checksum and the read checksum to error message. Make udf_error a public function and move the logging prototypes together. Original-patch-by: NamJae Jeon Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/misc.c | 13 +++++++++---- fs/udf/super.c | 6 ++---- fs/udf/udfdecl.h | 9 +++++++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 9215700c00a4..a85fa7c55b0a 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, { struct tag *tag_p; struct buffer_head *bh = NULL; + u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) @@ -211,7 +212,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_debug("block=%d, location=%d: read failed\n", + udf_error(sb, __func__, "read failed, block=%u, location=%d\n", block, location); return NULL; } @@ -227,15 +228,19 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, } /* Verify the tag checksum */ - if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { - printk(KERN_ERR "udf: tag checksum failed block %d\n", block); + checksum = udf_tag_checksum(tag_p); + if (checksum != tag_p->tagChecksum) { + udf_error(sb, __func__, + "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { - udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", + udf_error(sb, __func__, + "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", le16_to_cpu(tag_p->descVersion), block); goto error_out; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 7b27b063ff6d..80f47ce515bc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -92,8 +92,6 @@ static void udf_close_lvid(struct super_block *); static unsigned int 
udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); static int udf_show_options(struct seq_file *, struct vfsmount *); -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -2096,8 +2094,8 @@ error_out: return -EINVAL; } -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index dbd52d4b5eed..81e66afecd42 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -29,6 +29,13 @@ do { \ #define udf_debug(f, a...) /**/ #endif +__attribute__((format(printf, 3, 4))) +extern void udf_warning(struct super_block *, const char *, const char *, ...); + +__attribute__((format(printf, 3, 4))) +extern void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...); + #define udf_info(f, a...) \ printk(KERN_INFO "UDF-fs INFO " f, ##a); @@ -112,8 +119,6 @@ struct extent_position { /* super.c */ -__attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; -- cgit v1.2.3-58-ga151 From 8076c363da15e7c35a4094974d1b4bcc196b5fa9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:03 -0700 Subject: udf: Rename udf_error to udf_err Rename udf_error to udf_err for consistency with normal logging uses of pr_err. Rename function udf_err to _udf_err. Remove __func__ from uses and move __func__ to a new udf_err macro that calls _udf_err. Some of the udf_error uses had \n terminations, some did not so standardize \n's to udf_err uses, remove \n from _udf_err function. Coalesce udf_err formats. One message prefixed with udf_read_super is now prefixed with udf_fill_super. 
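The rename follows a common kernel logging idiom: one varargs worker function plus a short macro that injects __func__, so call sites stay compact and format strings remain greppable. A minimal portable sketch of the idiom is below; the superblock argument is dropped for brevity and the body is illustrative only.

/*
 * Sketch of the _worker + macro idiom: the macro supplies __func__
 * and the printf attribute gives compile-time format checking.
 */
#include <stdarg.h>
#include <stdio.h>

__attribute__((format(printf, 2, 3)))
static void _udf_err_sketch(const char *function, const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "UDF-fs error: %s: ", function);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

/* Call sites never spell out __func__ themselves. */
#define udf_err_sketch(fmt, ...) \
	_udf_err_sketch(__func__, fmt, ##__VA_ARGS__)

int main(void)
{
	udf_err_sketch("tag checksum failed, block %u\n", 42u);
	return 0;
}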
Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/misc.c | 18 ++++++++---------- fs/udf/super.c | 27 +++++++++++---------------- fs/udf/udfdecl.h | 7 ++++--- 3 files changed, 23 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/udf/misc.c b/fs/udf/misc.c index a85fa7c55b0a..c175b4dabc14 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -212,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_error(sb, __func__, "read failed, block=%u, location=%d\n", - block, location); + udf_err(sb, "read failed, block=%u, location=%d\n", + block, location); return NULL; } @@ -230,18 +230,16 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, /* Verify the tag checksum */ checksum = udf_tag_checksum(tag_p); if (checksum != tag_p->tagChecksum) { - udf_error(sb, __func__, - "tag checksum failed, block %u: 0x%02x != 0x%02x\n", - block, checksum, tag_p->tagChecksum); + udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { - udf_error(sb, __func__, - "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", - le16_to_cpu(tag_p->descVersion), block); + udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", + le16_to_cpu(tag_p->descVersion), block); goto error_out; } @@ -253,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, return bh; udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, - le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); - + le16_to_cpu(tag_p->descCRC), + le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; diff --git a/fs/udf/super.c b/fs/udf/super.c index 80f47ce515bc..76cc08f9202e 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -242,9 +242,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_error(sb, __func__, - "Unable to allocate space for %d partition maps", - count); + udf_err(sb, "Unable to allocate space for %d partition maps\n", + count); sbi->s_partitions = 0; return -ENOMEM; } @@ -879,8 +878,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) if (mdata->s_mirror_fe == NULL) { if (fe_error) { - udf_error(sb, __func__, "mirror inode efe not found " - "and metadata inode is missing too, exiting..."); + udf_err(sb, "mirror inode efe not found and metadata inode is missing too, exiting...\n"); goto error_exit; } else udf_warning(sb, __func__, "mirror inode efe not found," @@ -915,8 +913,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) "not found but it's ok since the disc" " is mounted read-only"); else { - udf_error(sb, __func__, "bitmap inode efe not " - "found and attempted read-write mount"); + udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); goto error_exit; } } @@ -969,9 +966,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ if (bitmap == NULL) { - udf_error(sb, __func__, - "Unable to allocate space for bitmap " - "and %d buffer_head pointers", nr_groups); + udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", + 
nr_groups); return NULL; } @@ -1935,8 +1931,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_UTF8) && uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_error(sb, "udf_read_super", - "utf8 cannot be combined with iocharset\n"); + udf_err(sb, "utf8 cannot be combined with iocharset\n"); goto error_out; } #ifdef CONFIG_UDF_NLS @@ -2094,8 +2089,8 @@ error_out: return -EINVAL; } -void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; @@ -2106,8 +2101,8 @@ void udf_error(struct super_block *sb, const char *function, va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); + printk(KERN_CRIT "UDF-fs error (device %s): %s: %s", + sb->s_id, function, error_buf); } void udf_warning(struct super_block *sb, const char *function, diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 81e66afecd42..7bc3ba1415fc 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -33,13 +33,14 @@ __attribute__((format(printf, 3, 4))) extern void udf_warning(struct super_block *, const char *, const char *, ...); __attribute__((format(printf, 3, 4))) -extern void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...); +extern void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) #define udf_info(f, a...) \ printk(KERN_INFO "UDF-fs INFO " f, ##a); - #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) -- cgit v1.2.3-58-ga151 From a40ecd7b3ccf520ff02da93e8d1ba6cd55c2e359 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:04 -0700 Subject: udf: Rename udf_warning to udf_warn Rename udf_warning to udf_warn for consistency with normal logging uses of pr_warn. Rename function udf_warning to _udf_warn. Remove __func__ from uses and move __func__ to a new udf_warn macro that calls _udf_warn. Add \n's to uses of udf_warn, remove \n from _udf_warn. Coalesce formats. 
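Coalescing formats matters mostly for grep-ability: a user-visible string split across source lines cannot be found by searching for the full sentence. A tiny illustration, with printf standing in for the udf_warn call:

#include <stdio.h>

int main(void)
{
	/* Before: split literal; grepping the full sentence fails. */
	printf("metadata inode efe does not have "
	       "short allocation descriptors!\n");

	/* After: one coalesced literal, trivially searchable. */
	printf("metadata inode efe does not have short allocation descriptors!\n");
	return 0;
}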
Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/namei.c | 5 ++--- fs/udf/partition.c | 3 +-- fs/udf/super.c | 22 ++++++++-------------- fs/udf/udfdecl.h | 5 ++++- 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce848ef96..78d59ebd0656 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -799,9 +799,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (inode->i_nlink != 2) - udf_warning(inode->i_sb, "udf_rmdir", - "empty directory has nlink != 2 (%d)", - inode->i_nlink); + udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n", + inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); diff --git a/fs/udf/partition.c b/fs/udf/partition.c index a71090ea0e07..c72edb2260e3 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -322,8 +322,7 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, BUG_ON(!inode); retblk = udf_try_read_meta(inode, block, partition, offset); if (retblk == 0xFFFFFFFF) { - udf_warning(sb, __func__, "error reading from METADATA, " - "trying to read from MIRROR"); + udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); inode = mdata->s_mirror_fe; if (!inode) return 0xFFFFFFFF; diff --git a/fs/udf/super.c b/fs/udf/super.c index 76cc08f9202e..622331f1290d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -855,13 +855,11 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) mdata->s_metadata_fe = udf_iget(sb, &addr); if (mdata->s_metadata_fe == NULL) { - udf_warning(sb, __func__, "metadata inode efe not found, " - "will try mirror inode."); + udf_warn(sb, "metadata inode efe not found, will try mirror inode\n"); fe_error = 1; } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "metadata inode efe does not have " - "short allocation descriptors!"); + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); fe_error = 1; iput(mdata->s_metadata_fe); mdata->s_metadata_fe = NULL; @@ -881,12 +879,10 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_err(sb, "mirror inode efe not found and metadata inode is missing too, exiting...\n"); goto error_exit; } else - udf_warning(sb, __func__, "mirror inode efe not found," - " but metadata inode is OK"); + udf_warn(sb, "mirror inode efe not found, but metadata inode is OK\n"); } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "mirror inode efe does not have " - "short allocation descriptors!"); + udf_warn(sb, "mirror inode efe does not have short allocation descriptors!\n"); iput(mdata->s_mirror_fe); mdata->s_mirror_fe = NULL; if (fe_error) @@ -909,9 +905,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) - udf_warning(sb, __func__, "bitmap inode efe " - "not found but it's ok since the disc" - " is mounted read-only"); + udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); goto error_exit; @@ -2105,15 +2099,15 @@ void _udf_err(struct super_block *sb, const char *function, sb->s_id, function, error_buf); } -void udf_warning(struct super_block *sb, const char *function, - const char *fmt, ...) 
+void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n", + printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s", sb->s_id, function, error_buf); } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 7bc3ba1415fc..85e15edc080f 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -30,7 +30,10 @@ do { \ #endif __attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); +extern void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...); +#define udf_warn(sb, fmt, ...) \ + _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) __attribute__((format(printf, 3, 4))) extern void _udf_err(struct super_block *sb, const char *function, -- cgit v1.2.3-58-ga151 From 71c3bcd71393a9e67d5b77597a612537f89c30ed Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 27 Sep 2011 21:42:29 -0400 Subject: nfsd4: fix state lock usage in LOCKU In commit 5ec094c1096ab3bb795651855d53f18daa26afde "nfsd4: extend state lock over seqid replay logic" I modified the exit logic of all the seqid-based procedures except nfsd4_locku(). Fix the oversight. The result of the bug was a double-unlock while handling the LOCKU procedure, and a warning like: [ 142.150014] WARNING: at kernel/mutex-debug.c:78 debug_mutex_unlock+0xda/0xe0() ... [ 142.152927] Pid: 742, comm: nfsd Not tainted 3.1.0-rc1-SLIM+ #9 [ 142.152927] Call Trace: [ 142.152927] [] warn_slowpath_common+0x7f/0xc0 [ 142.152927] [] warn_slowpath_null+0x1a/0x20 [ 142.152927] [] debug_mutex_unlock+0xda/0xe0 [ 142.152927] [] __mutex_unlock_slowpath+0x80/0x140 [ 142.152927] [] mutex_unlock+0xe/0x10 [ 142.152927] [] nfs4_lock_state+0x35/0x40 [nfsd] [ 142.152927] [] nfsd4_proc_compound+0x2a1/0x690 [nfsd] [ 142.152927] [] nfsd_dispatch+0xeb/0x230 [nfsd] [ 142.152927] [] svc_process_common+0x345/0x690 [sunrpc] [ 142.152927] [] ? try_to_wake_up+0x280/0x280 [ 142.152927] [] svc_process+0x102/0x150 [sunrpc] [ 142.152927] [] nfsd+0xbd/0x160 [nfsd] [ 142.152927] [] ? 0xffffffffa039efff [ 142.152927] [] kthread+0x8c/0xa0 [ 142.152927] [] kernel_thread_helper+0x4/0x10 [ 142.152927] [] ? kthread_worker_fn+0x190/0x190 [ 142.152927] [] ? gs_change+0x13/0x13 Reported-by: Bryan Schumaker Tested-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 931155f51ecc..8e253a347649 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4185,7 +4185,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; out_nfserr: -- cgit v1.2.3-58-ga151 From 6409a5a65d2b959c3f5e2b8adfa67c349e294652 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 28 Sep 2011 11:37:56 -0400 Subject: nfsd4: clean up downgrading code In response to some review comments, get rid of the somewhat obscure for-loop with bitops, and improve a comment. Reported-by: Steve Dickson Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 31 ++++++++++++++++++++++--------- fs/nfsd/state.h | 8 +++++--- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8e253a347649..683885b18ceb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3523,16 +3523,29 @@ out: return status; } -static inline void nfs4_file_downgrade(struct nfs4_ol_stateid *stp, unsigned int to_access) +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { - int i; + if (!test_bit(access, &stp->st_access_bmap)) + return; + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + __clear_bit(access, &stp->st_access_bmap); +} - for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap) - && ((i & to_access) != i)) { - nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); - __clear_bit(i, &stp->st_access_bmap); - } +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) +{ + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + BUG(); } } @@ -3578,7 +3591,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, stp->st_deny_bmap, od->od_share_deny); goto out; } - nfs4_file_downgrade(stp, od->od_share_access); + nfs4_stateid_downgrade(stp, od->od_share_access); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 13f6f9f5ceec..22b065a55ea0 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -403,9 +403,11 @@ struct nfs4_file { /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* - * Each open or lock stateid contributes 1 to either - * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending - * on open or lock mode: + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; struct file *fi_deleg_file; -- cgit v1.2.3-58-ga151 From b31b30e5c76b7653b4434fcdc3c5d2b46a367c2a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 28 Sep 2011 11:47:20 -0400 Subject: nfsd4: cleanup state.h comments These comments are mostly out of date. Reported-by: Bryan Schumaker --- fs/nfsd/state.h | 45 ++++++++------------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 22b065a55ea0..aa14f06af2df 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -332,25 +332,6 @@ struct nfs4_replay { char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -/* -* nfs4_stateowner can either be an open_owner, or a lock_owner -* -* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] -* for lock_owner -* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] -* for lock_owner -* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client -* struct is reaped. 
-* so_perfilestate: heads the list of nfs4_ol_stateid (either open or lock) -* and is used to ensure no dangling nfs4_ol_stateid references when we -* release a stateowner. -* so_perlockowner: (open) nfs4_ol_stateid->st_perlockowner entry - used when -* close is called to reap associated byte-range locks -* so_close_lru: (open) stateowner is placed on this list instead of being -* reaped (when so_perfilestate is empty) to hold the last close replay. -* reaped by laundramat thread after lease period. -*/ - struct nfs4_stateowner { struct list_head so_strhash; /* hash by op_name */ struct list_head so_stateids; @@ -366,7 +347,14 @@ struct nfs4_stateowner { struct nfs4_openowner { struct nfs4_stateowner oo_owner; /* must be first field */ struct list_head oo_perclient; - struct list_head oo_close_lru; /* tail queue */ + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 @@ -443,23 +431,6 @@ static inline struct file *find_any_file(struct nfs4_file *f) return f->fi_fds[O_RDONLY]; } -/* -* nfs4_ol_stateid can either be an open stateid or (eventually) a lock stateid -* -* (open)nfs4_ol_stateid: one per (open)nfs4_stateowner, nfs4_file -* -* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry -* st_perfile: file_hashtbl[] entry. -* st_perfile_state: nfs4_stateowner->so_perfilestate -* st_perlockowner: (open stateid) list of lock nfs4_stateowners -* st_access_bmap: used only for open stateid -* st_deny_bmap: used only for open stateid -* st_openstp: open stateid lock stateid was derived from -* -* XXX: open stateids and lock stateids have diverged sufficiently that -* we should consider defining separate structs for the two cases. -*/ - /* "ol" stands for "Open or Lock". Better suggestions welcome. */ struct nfs4_ol_stateid { struct nfs4_stid st_stid; -- cgit v1.2.3-58-ga151 From c30e92df30d7d5fe65262fbce5d1b7de675fe34e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 17:34:31 -0400 Subject: nfsd4: ignore WANT bits in open downgrade We don't use WANT bits yet--and sending them can probably trigger a BUG() further down. Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 683885b18ceb..a39a542d8be5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3574,6 +3574,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, if (!access_valid(od->od_share_access, cstate->minorversion) || !deny_valid(od->od_share_deny)) return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; nfs4_lock_state(); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, -- cgit v1.2.3-58-ga151 From 04f9e664b21c4440daf4d08f31db9b18517e4b8d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 14:37:13 -0400 Subject: nfsd4: move access/deny validity checks to xdr code I'd rather put more of these sorts of checks into standardized xdr decoders for the various types rather than have them cluttering up the core logic in nfs4proc.c and nfs4state.c. 
Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 32 --------------------- fs/nfsd/nfs4xdr.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 73 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a39a542d8be5..6bfa293e1c91 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2418,31 +2418,6 @@ find_file(struct inode *ino) return NULL; } -static inline int access_valid(u32 x, u32 minorversion) -{ - if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) - return 0; - if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) - return 0; - x &= ~NFS4_SHARE_ACCESS_MASK; - if (minorversion && x) { - if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) - return 0; - if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) - return 0; - x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); - } - if (x) - return 0; - return 1; -} - -static inline int deny_valid(u32 x) -{ - /* Note: unlike access bits, deny bits may be zero. */ - return x <= NFS4_SHARE_DENY_BOTH; -} - /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid @@ -2918,10 +2893,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_delegation *dp = NULL; __be32 status; - status = nfserr_inval; - if (!access_valid(open->op_share_access, resp->cstate.minorversion) - || !deny_valid(open->op_share_deny)) - goto out; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. @@ -3571,9 +3542,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access, cstate->minorversion) - || !deny_valid(od->od_share_deny)) - return nfserr_inval; /* We don't yet support WANT bits: */ od->od_share_access &= NFS4_SHARE_ACCESS_MASK; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5779acde7e70..94da8bb36c85 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -639,6 +639,64 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_TAIL; } +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + READ32(w); + *x = w; + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= !NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= !NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + READ32(*x); + /* Note: unlinke access bits, deny bits may be zero. 
*/ + if (*x & !NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -649,10 +707,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(16 + sizeof(clientid_t)); + READ_BUF(4); READ32(open->op_seqid); - READ32(open->op_share_access); - READ32(open->op_share_deny); + status = nfsd4_decode_share_access(argp, &open->op_share_access); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t) + 4); COPYMEM(&open->op_clientid, sizeof(clientid_t)); READ32(open->op_owner.len); @@ -753,11 +816,14 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(12); + READ_BUF(4); READ32(open_down->od_seqid); - READ32(open_down->od_share_access); - READ32(open_down->od_share_deny); - + status = nfsd4_decode_share_access(argp, &open_down->od_share_access); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; DECODE_TAIL; } -- cgit v1.2.3-58-ga151 From a084daf512bb66fa3c8e21c7027daea521179cd0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 15:07:40 -0400 Subject: nfsd4: move name-length checks to xdr Again, these checks are better in the xdr code. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 24 ++---------------------- fs/nfsd/nfs4xdr.c | 45 ++++++++++++++++++++++++++++++--------------- fs/nfsd/xdr4.h | 3 +-- 3 files changed, 33 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6bfa293e1c91..5f35f35a2da0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1189,17 +1189,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, return clp; } -static int check_name(struct xdr_netobj name) -{ - if (name.len == 0) - return 0; - if (name.len > NFS4_OPAQUE_LIMIT) { - dprintk("NFSD: check_name: name too long(%d)!\n", name.len); - return 0; - } - return 1; -} - static void add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) { @@ -1442,7 +1431,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); - if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; /* Currently only support SP4_NONE */ @@ -1992,19 +1981,13 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct xdr_netobj clname = { - .len = setclid->se_namelen, - .data = setclid->se_name, - }; + struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; char dname[HEXDIR_LEN]; - if (!check_name(clname)) - return nfserr_inval; - status = nfs4_make_rec_clidname(dname, &clname); if (status) return status; @@ -2523,9 +2506,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_openowner *oo = NULL; __be32 status; - if (!check_name(open->op_owner)) - return nfserr_inval; - if 
(STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 94da8bb36c85..2cab33cc3238 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -697,6 +697,23 @@ xdr_error: return nfserr_bad_xdr; } +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + READ32(o->len); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -715,13 +732,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) status = nfsd4_decode_share_deny(argp, &open->op_share_deny); if (status) goto xdr_error; - READ_BUF(sizeof(clientid_t) + 4); + READ_BUF(sizeof(clientid_t)); COPYMEM(&open->op_clientid, sizeof(clientid_t)); - READ32(open->op_owner.len); - - /* owner, open_flag */ - READ_BUF(open->op_owner.len + 4); - SAVEMEM(open->op_owner.data, open->op_owner.len); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); READ32(open->op_create); switch (open->op_create) { case NFS4_OPEN_NOCREATE: @@ -964,12 +980,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; - READ_BUF(12); + READ_BUF(8); COPYMEM(setclientid->se_verf.data, 8); - READ32(setclientid->se_namelen); - READ_BUF(setclientid->se_namelen + 8); - SAVEMEM(setclientid->se_name, setclientid->se_namelen); + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); READ32(setclientid->se_callback_prog); READ32(setclientid->se_callback_netid_len); @@ -1112,11 +1129,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); - READ_BUF(4); - READ32(exid->clname.len); - - READ_BUF(exid->clname.len); - SAVEMEM(exid->clname.data, exid->clname.len); + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; READ_BUF(4); READ32(exid->flags); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c9012149637c..4c8a7ec3f25d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -317,8 +317,7 @@ struct nfsd4_setattr { struct nfsd4_setclientid { nfs4_verifier se_verf; /* request */ - u32 se_namelen; /* request */ - char * se_name; /* request */ + struct xdr_netobj se_name; u32 se_callback_prog; /* request */ u32 se_callback_netid_len; /* request */ char * se_callback_netid_val; /* request */ -- cgit v1.2.3-58-ga151 From b6d2f1ca3c1162f51098969e9c52fd099720416a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 17:44:19 -0400 Subject: nfsd4: more robust ignoring of WANT bits in OPEN Mask out the WANT bits right at the start instead of on each use. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/nfs4state.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4e41f65c7021..5b192a2512b6 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -315,6 +315,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + /* We don't yet support WANT bits: */ + open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5f35f35a2da0..2042805da960 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2345,8 +2345,7 @@ static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_ stp->st_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, - &stp->st_access_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; return nfs_ok; @@ -2690,7 +2689,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + u32 op_share_access = open->op_share_access; bool new_access; __be32 status; -- cgit v1.2.3-58-ga151 From 875cd04381e695afbbbef025438113e12af84963 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 11 Oct 2011 13:43:50 +0100 Subject: cifs: Display strictcache mount option in /proc/mounts Commit d39454ffe4a3c85428483b8a8a8e5e797b6363d5 adds a strictcache mount option. This patch allows the display of this mount option in /proc/mounts when listing shares mounted with the strictcache mount option. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 54b8f1e7da94..03d2f6b27cc4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -432,6 +432,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + seq_printf(s, ",strictcache"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); -- cgit v1.2.3-58-ga151 From 0c38a2512df272b14ef4238b476a2e4f70da1479 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Aug 2011 07:17:01 +0000 Subject: xfs: don't serialise direct IO reads on page cache checks There is no need to grab the i_mutex of the IO lock in exclusive mode if we don't need to invalidate the page cache. Taking these locks on every direct IO effective serialises them as taking the IO lock in exclusive mode has to wait for all shared holders to drop the lock. That only happens when IO is complete, so effective it prevents dispatch of concurrent direct IO reads to the same inode. Fix this by taking the IO lock shared to check the page cache state, and only then drop it and take the IO lock exclusively if there is work to be done. Hence for the normal direct IO case, no exclusive locking will occur. 
Signed-off-by: Dave Chinner Tested-by: Joern Engel Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7f7b42469ea7..8fd4a0708d30 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -317,7 +317,19 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) { + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { @@ -330,8 +342,7 @@ xfs_file_aio_read( } } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - } else - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + } trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); -- cgit v1.2.3-58-ga151 From 7271d243f9d1b4106289e4cf876c8b1203de59ab Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Aug 2011 07:17:02 +0000 Subject: xfs: don't serialise adjacent concurrent direct IO appending writes For append write workloads, extending the file requires a certain amount of exclusive locking to be done up front to ensure sanity in things like ensuring that we've zeroed any allocated regions between the old EOF and the start of the new IO. For single threads, this typically isn't a problem, and for large IOs we don't serialise enough for it to be a problem for two threads on really fast block devices. However for smaller IO and larger thread counts we have a problem. Take 4 concurrent sequential, single block sized and aligned IOs. After the first IO is submitted but before it completes, we end up with this state: IO 1 IO 2 IO 3 IO 4 +-------+-------+-------+-------+ ^ ^ | | | | | | | \- ip->i_new_size \- ip->i_size And the IO is done without exclusive locking because offset <= ip->i_size. When we submit IO 2, we see offset > ip->i_size, and grab the IO lock exclusive, because there is a chance we need to do EOF zeroing. However, there is already an IO in progress that avoids the need for IO zeroing because offset <= ip->i_new_size. hence we could avoid holding the IO lock exlcusive for this. Hence after submission of the second IO, we'd end up this state: IO 1 IO 2 IO 3 IO 4 +-------+-------+-------+-------+ ^ ^ | | | | | | | \- ip->i_new_size \- ip->i_size There is no need to grab the i_mutex of the IO lock in exclusive mode if we don't need to invalidate the page cache. Taking these locks on every direct IO effective serialises them as taking the IO lock in exclusive mode has to wait for all shared holders to drop the lock. That only happens when IO is complete, so effective it prevents dispatch of concurrent direct IO writes to the same inode. And so you can see that for the third concurrent IO, we'd avoid exclusive locking for the same reason we avoided the exclusive lock for the second IO. 
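In outline, the submission-time decision is simply whether this write starts beyond every in-flight write; only then can EOF zeroing, and hence the exclusive lock, be needed. A simplified sketch of that check, with stubbed types and no locking (the real patch threads it through xfs_file_aio_write_checks()):

#include <stdbool.h>
#include <stdio.h>

struct inode_sketch {
	long long i_size;     /* current EOF */
	long long i_new_size; /* highest end offset of in-flight writes */
};

/*
 * Zeroing between EOF and the write start is only needed when this
 * write begins beyond every other in-flight write; otherwise an
 * earlier write already covers (or will cover) the gap.
 */
static bool needs_exclusive_for_zeroing(const struct inode_sketch *ip,
					long long pos)
{
	if (ip->i_new_size)
		return pos > ip->i_new_size;
	return pos > ip->i_size;
}

int main(void)
{
	struct inode_sketch ip = { .i_size = 4096, .i_new_size = 8192 };

	/* IO 3 of the example above: starts exactly where IO 2 ends. */
	printf("pos 8192 -> %s\n",
	       needs_exclusive_for_zeroing(&ip, 8192) ? "exclusive" : "shared");
	/* A write leaving a hole past all in-flight IO still serialises. */
	printf("pos 16384 -> %s\n",
	       needs_exclusive_for_zeroing(&ip, 16384) ? "exclusive" : "shared");
	return 0;
}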
Fixing this is a bit more complex than that, because we need to hold a write-submission local value of ip->i_new_size to that clearing the value is only done if no other thread has updated it before our IO completes..... Signed-off-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 68 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8fd4a0708d30..cbbac5cc9c26 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -418,11 +418,13 @@ xfs_aio_write_isize_update( */ STATIC void xfs_aio_write_newsize_update( - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_fsize_t new_size) { - if (ip->i_new_size) { + if (new_size == ip->i_new_size) { xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; + if (new_size == ip->i_new_size) + ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); @@ -473,7 +475,7 @@ xfs_file_splice_write( ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -670,6 +672,7 @@ xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; @@ -677,6 +680,8 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + *new_sizep = 0; +restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); @@ -684,20 +689,41 @@ xfs_file_aio_write_checks( return error; } - new_size = *pos + *count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. + * write. There is no need to issue zeroing if another in-flght IO ends + * at or before this one If zeronig is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > ip->i_size) + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + goto restart; + } error = -xfs_zero_eof(ip, *pos, ip->i_size); + } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. 
+ */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -744,6 +770,7 @@ xfs_file_dio_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -764,13 +791,20 @@ xfs_file_dio_aio_write( if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; - if (unaligned_io || mapping->nrpages || pos > ip->i_size) + /* + * We don't need to take an exclusive lock unless the page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -809,6 +843,7 @@ xfs_file_buffered_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -822,7 +857,7 @@ xfs_file_buffered_aio_write( *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -862,6 +897,7 @@ xfs_file_aio_write( ssize_t ret; int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -881,10 +917,10 @@ xfs_file_aio_write( if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); @@ -905,7 +941,7 @@ xfs_file_aio_write( } out_unlock: - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_rw_iunlock(ip, iolock); return ret; } -- cgit v1.2.3-58-ga151 From 375ec69d2ef6e0797f19f5823e36e249765c3d41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:03 +0000 Subject: xfs: remove delwri buffer handling from xfs_buf_iorequest We cannot ever reach xfs_buf_iorequest for a buffer with XBF_DELWRI set, given that all write handlers make sure that the buffer is removed from the delwri queue beforehand, and we never do reads with the XBF_DELWRI flag set (which the code would not handle correctly anyway).
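The pattern used in this patch, replacing a defensive branch that callers can no longer reach with a debug-build assertion, is easy to illustrate outside the kernel. The following is a minimal userspace C sketch of the idea only; the names (submit_io, FLAG_DELWRI, FLAG_WRITE) are made up for illustration and are not the XFS API.

	#include <assert.h>
	#include <stdio.h>

	#define FLAG_DELWRI (1u << 0)	/* buffer sits on the delayed-write queue */
	#define FLAG_WRITE  (1u << 1)	/* buffer is being written */

	struct buf { unsigned int flags; };

	/*
	 * Every caller now guarantees the buffer has been removed from the
	 * delayed-write queue before submission, so the old re-queueing branch
	 * is dead code. An assertion documents (and, in debug builds, enforces)
	 * the invariant instead of silently handling an impossible state.
	 */
	static void submit_io(struct buf *bp)
	{
		assert(!(bp->flags & FLAG_DELWRI));

		if (bp->flags & FLAG_WRITE)
			printf("waiting for unpin before write\n");
		/* ... issue the I/O here ... */
	}

	int main(void)
	{
		struct buf bp = { .flags = FLAG_WRITE };
		submit_io(&bp);
		return 0;
	}

The design point is that an assertion, unlike the removed branch, costs nothing in production builds while still documenting the caller contract.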
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c57836dc778f..2e71a26da22e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1275,15 +1275,10 @@ xfs_buf_iorequest( { trace_xfs_buf_iorequest(bp, _RET_IP_); - if (bp->b_flags & XBF_DELWRI) { - xfs_buf_delwri_queue(bp, 1); - return 0; - } + ASSERT(!(bp->b_flags & XBF_DELWRI)); - if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - } - xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O -- cgit v1.2.3-58-ga151 From 527cfdf19dd538a5a9e46b9bed0f30a38c28438d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:04 +0000 Subject: xfs: remove the unlock argument to xfs_buf_delwri_queue We can just unlock the buffer in the caller, and the decrement of b_hold would also be needed in the !unlock case; we just never hit that case currently, given that the caller handles it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2e71a26da22e..04689dbbcbba 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); +STATIC void xfs_buf_delwri_queue(xfs_buf_t *); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -940,7 +940,7 @@ xfs_buf_unlock( if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { atomic_inc(&bp->b_hold); bp->b_flags |= XBF_ASYNC; - xfs_buf_delwri_queue(bp, 0); + xfs_buf_delwri_queue(bp); } XB_CLEAR_OWNER(bp); @@ -1049,7 +1049,8 @@ xfs_bdwrite( bp->b_flags &= ~XBF_READ; bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - xfs_buf_delwri_queue(bp, 1); + xfs_buf_delwri_queue(bp); + xfs_buf_unlock(bp); } /* @@ -1562,8 +1563,7 @@ error: */ STATIC void xfs_buf_delwri_queue( - xfs_buf_t *bp, - int unlock) + xfs_buf_t *bp) { struct list_head *dwq = &bp->b_target->bt_delwrite_queue; spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; @@ -1576,8 +1576,7 @@ xfs_buf_delwri_queue( /* If already in the queue, dequeue and place at tail */ if (!list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); - if (unlock) - atomic_dec(&bp->b_hold); + atomic_dec(&bp->b_hold); list_del(&bp->b_list); } @@ -1590,9 +1589,6 @@ xfs_buf_delwri_queue( list_add_tail(&bp->b_list, dwq); bp->b_queuetime = jiffies; spin_unlock(dwlk); - - if (unlock) - xfs_buf_unlock(bp); } void -- cgit v1.2.3-58-ga151 From 5a8ee6bafdd0ab8555adceac8b2cec539a552a1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:05 +0000 Subject: xfs: move more delwri setup into xfs_buf_delwri_queue Do not transfer a reference held by the caller to the buffer on the list, or decrement it in xfs_buf_delwri_queue, but instead grab a new reference if needed, and let the caller drop its own reference. Also move setting of the XBF_DELWRI and XBF_ASYNC flags into xfs_buf_delwri_queue, and only do it if needed. Note that for now buffers reaching xfs_buf_unlock already have XBF_DELWRI set, but that will change in the following patches.
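The reference-counting rule this patch adopts, that a queue takes its own reference rather than stealing the caller's, can be sketched in a few lines of userspace C. Everything below (object, delwri_queue, the refcount helpers) is hypothetical and single-threaded for brevity; the kernel code additionally holds a spinlock around the queue manipulation.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdlib.h>

	struct object {
		atomic_int refcount;
		bool queued;
	};

	static void object_get(struct object *o) { atomic_fetch_add(&o->refcount, 1); }

	static void object_put(struct object *o)
	{
		if (atomic_fetch_sub(&o->refcount, 1) == 1)
			free(o);
	}

	/*
	 * The queue owns its own reference: take a new one when inserting
	 * instead of inheriting the caller's, so the caller can always drop
	 * its reference afterwards, whether or not the object was already
	 * queued.
	 */
	static void delwri_queue(struct object *o)
	{
		if (!o->queued) {
			object_get(o);	/* reference now held by the queue */
			o->queued = true;
		}
		/* if already queued we would just move it to the tail */
	}

	int main(void)
	{
		struct object *o = calloc(1, sizeof(*o));
		atomic_init(&o->refcount, 1);	/* caller's reference */
		delwri_queue(o);
		object_put(o);			/* caller drops its own reference */
		/* the queue's reference keeps o alive at this point */
		return 0;
	}

The payoff is symmetry: the caller's get/put always pair up locally, regardless of what the queue does.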
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 04689dbbcbba..86c0945053c9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -937,11 +937,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_ASYNC; + if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) xfs_buf_delwri_queue(bp); - } XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1046,11 +1043,8 @@ xfs_bdwrite( { trace_xfs_buf_bdwrite(bp, _RET_IP_); - bp->b_flags &= ~XBF_READ; - bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); } /* @@ -1570,23 +1564,22 @@ xfs_buf_delwri_queue( trace_xfs_buf_delwri_queue(bp, _RET_IP_); - ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + ASSERT(!(bp->b_flags & XBF_READ)); spin_lock(dwlk); - /* If already in the queue, dequeue and place at tail */ if (!list_empty(&bp->b_list)) { + /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - atomic_dec(&bp->b_hold); - list_del(&bp->b_list); - } - - if (list_empty(dwq)) { + list_move_tail(&bp->b_list, dwq); + } else { /* start xfsbufd as it is about to have something to do */ - wake_up_process(bp->b_target->bt_task); - } + if (list_empty(dwq)) + wake_up_process(bp->b_target->bt_task); - bp->b_flags |= _XBF_DELWRI_Q; - list_add_tail(&bp->b_list, dwq); + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; + list_add_tail(&bp->b_list, dwq); + } bp->b_queuetime = jiffies; spin_unlock(dwlk); } -- cgit v1.2.3-58-ga151 From 61551f1ee536289084a4a8f1c4f187e2f371c440 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:06 +0000 Subject: xfs: call xfs_buf_delwri_queue directly Unify the ways we add buffers to the delwri queue by always calling xfs_buf_delwri_queue directly. The xfs_bdwrite function is removed and open-coded in its callers, and the two places setting XBF_DELWRI while a buffer is locked and expecting xfs_buf_unlock to pick it up are converted to call xfs_buf_delwri_queue directly, too. Also replace the XFS_BUF_UNDELAYWRITE macro with direct calls to xfs_buf_delwri_dequeue to make the explicit queuing/dequeuing more obvious.
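Open-coding a trivial wrapper like xfs_bdwrite at its call sites trades a little verbosity for an explicit queue/release pairing. Here is a hedged userspace C sketch of the before/after shape; buf_dwrite, buf_delwri_queue and buf_release are illustrative names, not the kernel functions.

	#include <stdio.h>

	struct buf { int id; };

	static void buf_delwri_queue(struct buf *bp) { printf("queue %d\n", bp->id); }
	static void buf_release(struct buf *bp)      { printf("release %d\n", bp->id); }

	/* Before: a thin wrapper hid the two-step sequence behind one call. */
	static void buf_dwrite(struct buf *bp)
	{
		buf_delwri_queue(bp);
		buf_release(bp);
	}

	int main(void)
	{
		struct buf b = { .id = 1 };

		/*
		 * After: the wrapper is deleted and each call site spells out
		 * the sequence, so the ownership hand-off is visible exactly
		 * where it happens.
		 */
		buf_delwri_queue(&b);
		buf_release(&b);

		(void)buf_dwrite;	/* would be removed along with its callers */
		return 0;
	}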
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_buf.c | 21 +++------------------ fs/xfs/xfs_buf.h | 8 +++----- fs/xfs/xfs_buf_item.c | 4 ++-- fs/xfs/xfs_dquot.c | 6 ++++-- fs/xfs/xfs_inode.c | 6 ++++-- fs/xfs/xfs_log_recover.c | 9 ++++++--- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_qm.c | 3 ++- fs/xfs/xfs_rw.c | 2 +- fs/xfs/xfs_trace.h | 1 - fs/xfs/xfs_trans_buf.c | 5 +++-- 12 files changed, 30 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 160bcdc34a6e..b097fd5a2b3f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2189,7 +2189,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { XFS_BUF_STALE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); xfs_buf_relse(bp); bp = NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 86c0945053c9..309eca75fad4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,6 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -937,9 +936,6 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) - xfs_buf_delwri_queue(bp); - XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1036,17 +1032,6 @@ xfs_bwrite( return error; } -void -xfs_bdwrite( - void *mp, - struct xfs_buf *bp) -{ - trace_xfs_buf_bdwrite(bp, _RET_IP_); - - xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1069,7 +1054,7 @@ xfs_bioerror( * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); @@ -1098,7 +1083,7 @@ xfs_bioerror_relse( * change that interface. 
*/ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); XFS_BUF_STALE(bp); bp->b_iodone = NULL; @@ -1555,7 +1540,7 @@ error: /* * Delayed write buffer handling */ -STATIC void +void xfs_buf_delwri_queue( xfs_buf_t *bp) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 620972b8094d..f1a8933becb6 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -198,7 +198,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); @@ -221,8 +220,9 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_dequeue(xfs_buf_t *); -extern void xfs_buf_delwri_promote(xfs_buf_t *); +extern void xfs_buf_delwri_queue(struct xfs_buf *); +extern void xfs_buf_delwri_dequeue(struct xfs_buf *); +extern void xfs_buf_delwri_promote(struct xfs_buf *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); @@ -251,8 +251,6 @@ void xfs_buf_stale(struct xfs_buf *bp); XFS_BUF_DONE(bp); \ } while (0) -#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cac2ecfa6746..3243083d8693 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -992,7 +992,7 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - XFS_BUF_DELAYWRITE(bp); + xfs_buf_delwri_queue(bp); XFS_BUF_DONE(bp); } ASSERT(bp->b_iodone != NULL); @@ -1007,7 +1007,7 @@ xfs_buf_iodone_callbacks( */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index db62959bed13..0f78dd46415c 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1243,8 +1243,10 @@ xfs_qm_dqflush( if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - else - xfs_bdwrite(mp, bp); + else { + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); + } trace_xfs_dqflush_done(dqp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0239a7c7c886..f8fe1c66d420 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2598,8 +2598,10 @@ xfs_iflush( if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - else - xfs_bdwrite(mp, bp); + else { + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); + } return error; corrupt_out: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a199dbcee7d8..22946949bf5e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2176,7 +2176,8 @@ xlog_recover_buffer_pass2( } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); } return (error); @@ -2439,7 +2440,8 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2561,7 +2563,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); 
ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); return (0); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0081657ad985..a957e437bee1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1612,7 +1612,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); + xfs_buf_delwri_dequeue(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); ASSERT(sbp->b_target == mp->m_ddev_targp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 9a0aa76facdf..f51bef885e6d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); /* * goto the next block. */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index c96a8a05ac03..99823c3b9aca 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -149,7 +149,7 @@ xfs_read_buf( } if (bp) { XFS_BUF_UNDONE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_STALE(bp); /* * brelse clears B_ERROR and b_error diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 690fc7a7bd72..bb5e660e0fab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -320,7 +320,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); -DEFINE_BUF_EVENT(xfs_buf_bdwrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 137e2b9e2948..5e5196a269dd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); ASSERT(atomic_read(&bip->bli_refcount) > 0); bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_buf_delwri_queue(bp); + trace_xfs_trans_log_buf(bip); /* @@ -738,7 +739,7 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); -- cgit v1.2.3-58-ga151 From c2b006c1da1602551def200e4661535f02b82488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:07 +0000 Subject: xfs: let xfs_bwrite callers handle the xfs_buf_relse Remove the xfs_buf_relse from xfs_bwrite and let the caller handle it to mirror the delwri and read paths. Also remove the mount pointer passed to xfs_bwrite, which is superfluous now that we have a mount pointer in the buftarg.
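Moving the release out of the write routine gives every path, read, delwri and synchronous write, the same shape: the caller that obtained the buffer releases it. A minimal userspace C sketch of that calling convention follows; buf_write and buf_release are illustrative stand-ins, not the kernel API.

	#include <stdio.h>

	struct buf { int error; };

	/*
	 * The write routine reports success or failure but no longer releases
	 * the buffer; ownership stays with the caller throughout.
	 */
	static int buf_write(struct buf *bp)
	{
		/* ... issue synchronous I/O and wait for it ... */
		return bp->error;
	}

	static void buf_release(struct buf *bp)
	{
		(void)bp;	/* drop the lock and the reference here */
	}

	int main(void)
	{
		struct buf b = { .error = 0 };
		int error;

		/* every call site now pairs the write with an explicit release */
		error = buf_write(&b);
		buf_release(&b);
		if (error)
			fprintf(stderr, "write failed: %d\n", error);
		return 0;
	}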
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 7 ++++--- fs/xfs/xfs_buf.c | 8 ++++---- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_dquot.c | 8 ++++---- fs/xfs/xfs_fsops.c | 40 ++++++++++++++++++++++------------------ fs/xfs/xfs_inode.c | 8 ++++---- fs/xfs/xfs_log_recover.c | 11 +++++++---- fs/xfs/xfs_sync.c | 6 ++++-- 8 files changed, 50 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b097fd5a2b3f..8f0f65833500 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2128,9 +2128,10 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); - if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ - return (error); - } + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; src += tmp; valuelen -= tmp; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 309eca75fad4..63dbeb9efc49 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1014,7 +1014,6 @@ xfs_buf_ioerror( int xfs_bwrite( - struct xfs_mount *mp, struct xfs_buf *bp) { int error; @@ -1026,9 +1025,10 @@ xfs_bwrite( xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } return error; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f1a8933becb6..3f543ed3009b 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -197,7 +197,7 @@ extern void xfs_buf_unlock(xfs_buf_t *); ((bp)->b_sema.count <= 0) /* Buffer Read and Write Routines */ -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); +extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0f78dd46415c..3e2ccaedc51e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1242,11 +1242,11 @@ xfs_qm_dqflush( } if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); - else { + error = xfs_bwrite(bp); + else xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); - } + + xfs_buf_relse(bp); trace_xfs_dqflush_done(dqp); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9153d2c77caf..e023f940a3dd 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -216,10 +216,11 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * AG inode header block */ @@ -240,10 +241,11 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * BNO btree root block */ @@ -262,10 +264,11 @@ xfs_growfs_data_private( arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * CNT btree root block */ @@ 
-285,10 +288,11 @@ xfs_growfs_data_private( arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * INO btree root block */ @@ -303,10 +307,10 @@ xfs_growfs_data_private( block->bb_numrecs = 0; block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -396,9 +400,9 @@ xfs_growfs_data_private( * just issue a warning and continue. The real work is * already done and committed. */ - if (!(error = xfs_bwrite(mp, bp))) { - continue; - } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f8fe1c66d420..7f237ba3c292 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2597,11 +2597,11 @@ xfs_iflush( goto cluster_corrupt_out; if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); - else { + error = xfs_bwrite(bp); + else xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); - } + + xfs_buf_relse(bp); return error; corrupt_out: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 22946949bf5e..be173852b2ca 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -268,9 +268,12 @@ xlog_bwrite( xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - if ((error = xfs_bwrite(log->l_mp, bp))) + error = xfs_bwrite(bp); + if (error) { xfs_ioerror_alert("xlog_bwrite", log->l_mp, bp, XFS_BUF_ADDR(bp)); + } + xfs_buf_relse(bp); return error; } @@ -2172,15 +2175,15 @@ xlog_recover_buffer_pass2( (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); } - return (error); + xfs_buf_relse(bp); + return error; } STATIC int diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 4604f90f86a3..90cc197e0433 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -322,6 +322,7 @@ xfs_sync_fsdata( struct xfs_mount *mp) { struct xfs_buf *bp; + int error; /* * If the buffer is pinned then push on the log so we won't get stuck @@ -334,8 +335,9 @@ xfs_sync_fsdata( bp = xfs_getsb(mp, 0); if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); - - return xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + return error; } /* -- cgit v1.2.3-58-ga151 From c4e1c098ee8a72ea563a697a2b175868be86fdc9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:08 +0000 Subject: xfs: use the "delwri" terminology consistently And also remove the strange local lock and delwri list pointers in a few functions. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 43 +++++++++++++++++++------------------------ fs/xfs/xfs_buf.h | 4 ++-- 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 63dbeb9efc49..d3c2b58d7d70 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1489,12 +1489,12 @@ xfs_setsize_buftarg( } STATIC int -xfs_alloc_delwrite_queue( +xfs_alloc_delwri_queue( xfs_buftarg_t *btp, const char *fsname) { - INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spin_lock_init(&btp->bt_delwrite_lock); + INIT_LIST_HEAD(&btp->bt_delwri_queue); + spin_lock_init(&btp->bt_delwri_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) @@ -1524,7 +1524,7 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp, fsname)) + if (xfs_alloc_delwri_queue(btp, fsname)) goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1544,46 +1544,44 @@ void xfs_buf_delwri_queue( xfs_buf_t *bp) { - struct list_head *dwq = &bp->b_target->bt_delwrite_queue; - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + struct xfs_buftarg *btp = bp->b_target; trace_xfs_buf_delwri_queue(bp, _RET_IP_); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(dwlk); + spin_lock(&btp->bt_delwri_lock); if (!list_empty(&bp->b_list)) { /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, dwq); + list_move_tail(&bp->b_list, &btp->bt_delwri_queue); } else { /* start xfsbufd as it is about to have something to do */ - if (list_empty(dwq)) + if (list_empty(&btp->bt_delwri_queue)) wake_up_process(bp->b_target->bt_task); atomic_inc(&bp->b_hold); bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, dwq); + list_add_tail(&bp->b_list, &btp->bt_delwri_queue); } bp->b_queuetime = jiffies; - spin_unlock(dwlk); + spin_unlock(&btp->bt_delwri_lock); } void xfs_buf_delwri_dequeue( xfs_buf_t *bp) { - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(dwlk); + spin_lock(&bp->b_target->bt_delwri_lock); if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); list_del_init(&bp->b_list); dequeued = 1; } bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(dwlk); + spin_unlock(&bp->b_target->bt_delwri_lock); if (dequeued) xfs_buf_rele(bp); @@ -1615,9 +1613,9 @@ xfs_buf_delwri_promote( if (bp->b_queuetime < jiffies - age) return; bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwrite_lock); - list_move(&bp->b_list, &btp->bt_delwrite_queue); - spin_unlock(&btp->bt_delwrite_lock); + spin_lock(&btp->bt_delwri_lock); + list_move(&bp->b_list, &btp->bt_delwri_queue); + spin_unlock(&btp->bt_delwri_lock); } STATIC void @@ -1638,15 +1636,13 @@ xfs_buf_delwri_split( unsigned long age) { xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; int skipped = 0; int force; force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); INIT_LIST_HEAD(list); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { + spin_lock(&target->bt_delwri_lock); + list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { @@ -1663,10 
+1659,9 @@ xfs_buf_delwri_split( } else skipped++; } - spin_unlock(dwlk); + spin_unlock(&target->bt_delwri_lock); return skipped; - } /* @@ -1716,7 +1711,7 @@ xfsbufd( } /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwrite_queue)) + if (list_empty(&target->bt_delwri_queue)) tout = MAX_SCHEDULE_TIMEOUT; schedule_timeout_interruptible(tout); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3f543ed3009b..c23826685d3d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -105,8 +105,8 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_delwrite_queue; - spinlock_t bt_delwrite_lock; + struct list_head bt_delwri_queue; + spinlock_t bt_delwri_lock; unsigned long bt_flags; /* LRU control structures */ -- cgit v1.2.3-58-ga151 From 398d25ef23b10ce75424e0336a8d059dda1dbc8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:09 +0000 Subject: xfs: remove dead ENODEV handling in xfs_destroy_ioend No driver returns ENODEV from its bio completion handler, nor has this ever been documented. Remove the dead code dealing with it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8c37dde4c521..d91564404abf 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -122,17 +122,6 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } - /* - * Volume managers supporting multiple paths can send back ENODEV - * when the final path disappears. In this case continuing to fill - * the page cache with dirty data which cannot be written out is - * evil, so prevent that. - */ - if (unlikely(ioend->io_error == -ENODEV)) { - xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, - __FILE__, __LINE__); - } - xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } -- cgit v1.2.3-58-ga151 From c859cdd1da008b3825555be3242908088a3de366 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:10 +0000 Subject: xfs: defer AIO/DIO completions We really shouldn't complete AIO or DIO requests until we have finished the unwritten extent conversion and size update. This means fsync never has to pick up any ioends as all work has been completed when signalling I/O completion. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 26 +++++++++----------------- fs/xfs/xfs_aops.h | 1 + 2 files changed, 10 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d91564404abf..10660c364105 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -122,6 +122,11 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } + if (ioend->io_iocb) { + if (ioend->io_isasync) + aio_complete(ioend->io_iocb, ioend->io_result, 0); + inode_dio_done(ioend->io_inode); + } xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } @@ -236,8 +241,6 @@ xfs_end_io( /* ensure we don't spin on blocked ioends */ delay(1); } else { - if (ioend->io_iocb) - aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); } } @@ -274,6 +277,7 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early.
*/ atomic_set(&ioend->io_remaining, 1); + ioend->io_isasync = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; @@ -1289,7 +1293,6 @@ xfs_end_io_direct_write( bool is_async) { struct xfs_ioend *ioend = iocb->private; - struct inode *inode = ioend->io_inode; /* * blockdev_direct_IO can return an error even after the I/O @@ -1300,28 +1303,17 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; + ioend->io_iocb = iocb; + ioend->io_result = ret; if (private && size > 0) ioend->io_type = IO_UNWRITTEN; if (is_async) { - /* - * If we are converting an unwritten extent we need to delay - * the AIO completion until after the unwrittent extent - * conversion has completed, otherwise do it ASAP. - */ - if (ioend->io_type == IO_UNWRITTEN) { - ioend->io_iocb = iocb; - ioend->io_result = ret; - } else { - aio_complete(iocb, ret, 0); - } + ioend->io_isasync = 1; xfs_finish_ioend(ioend); } else { xfs_finish_ioend_sync(ioend); } - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } STATIC ssize_t diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 71f721e1a71f..ce3dcb50762e 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -47,6 +47,7 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ + unsigned int io_isasync : 1; /* needs aio_complete */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ -- cgit v1.2.3-58-ga151 From fc0063c4474599b7a066ba76b90902abe21bc675 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:11 +0000 Subject: xfs: reduce ioend latency There is no reason to queue up ioends for processing in user context unless we actually need it. Just complete ioends that do not convert unwritten extents or need a size update from the end_io context. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 10660c364105..e1ff0770784e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -149,6 +149,15 @@ xfs_ioend_new_eof( return isize > ip->i_d.di_size ? isize : 0; } +/* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + /* * Update on-disk file size now that data has been written to disk. The * current in-memory file size is i_size. If a write is beyond eof i_new_size @@ -186,6 +195,9 @@ xfs_setfilesize( /* * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. 
*/ STATIC void xfs_finish_ioend( @@ -194,8 +206,10 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { if (ioend->io_type == IO_UNWRITTEN) queue_work(xfsconvertd_workqueue, &ioend->io_work); - else + else if (xfs_ioend_is_append(ioend)) queue_work(xfsdatad_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); } } -- cgit v1.2.3-58-ga151 From 2b3ffd7eb7b4392e3657c5046b055ca9f1f7cf5e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:12 +0000 Subject: xfs: wait for I/O completion when writing out pages in xfs_setattr_size The current code relies on the xfs_ioend_wait call later on to make sure all I/O actually has completed. The xfs_ioend_wait call will go away soon, so prepare for that by using the waiting filemap function. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_iops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 673704fab748..32aca87bde5e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -833,8 +833,8 @@ xfs_setattr_size( * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XBF_ASYNC, FI_NONE); + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + FI_NONE); if (error) goto out_unlock; } -- cgit v1.2.3-58-ga151 From 4a06fd262dbeb70a2c315f7259e063efa493fe3d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:13 +0000 Subject: xfs: remove i_iocount We now have an i_dio_count field and surrounding infrastructure to wait for direct I/O completion instead of i_iocount, and we have never needed the iocount waits for buffered I/O given that we only set the page uptodate after finishing all required work. Thus remove i_iocount, and replace the actually needed waits with calls to inode_dio_wait. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 39 +-------------------------------------- fs/xfs/xfs_aops.h | 3 --- fs/xfs/xfs_file.c | 8 ++------ fs/xfs/xfs_iget.c | 2 -- fs/xfs/xfs_inode.h | 1 - fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_super.c | 7 +------ fs/xfs/xfs_sync.c | 8 ++------ fs/xfs/xfs_vnodeops.c | 4 +--- 9 files changed, 9 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e1ff0770784e..46bba3e0af47 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -38,40 +38,6 @@ #include #include - -/* - * Prime number of hash buckets since address is used as the key.
- */ -#define NVSYNC 37 -#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) -static wait_queue_head_t xfs_ioend_wq[NVSYNC]; - -void __init -xfs_ioend_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&xfs_ioend_wq[i]); -} - -void -xfs_ioend_wait( - xfs_inode_t *ip) -{ - wait_queue_head_t *wq = to_ioend_wq(ip); - - wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); -} - -STATIC void -xfs_ioend_wake( - xfs_inode_t *ip) -{ - if (atomic_dec_and_test(&ip->i_iocount)) - wake_up(to_ioend_wq(ip)); -} - void xfs_count_page_state( struct page *page, @@ -115,7 +81,6 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; - struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; @@ -127,7 +92,7 @@ xfs_destroy_ioend( aio_complete(ioend->io_iocb, ioend->io_result, 0); inode_dio_done(ioend->io_inode); } - xfs_ioend_wake(ip); + mempool_free(ioend, xfs_ioend_pool); } @@ -298,7 +263,6 @@ xfs_alloc_ioend( ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; ioend->io_iocb = NULL; @@ -558,7 +522,6 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ce3dcb50762e..116dd5c37034 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -61,9 +61,6 @@ typedef struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); -extern void xfs_ioend_init(void); -extern void xfs_ioend_wait(struct xfs_inode *); - extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cbbac5cc9c26..ee63c4fb3639 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -149,10 +149,6 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_ioend_wait(ip); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure @@ -758,7 +754,7 @@ restart: * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. 
@@ -821,7 +817,7 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) - xfs_ioend_wait(ip); + inode_dio_wait(inode); else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); *iolock = XFS_IOLOCK_SHARED; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 7759812c1bbe..0fa98b1c70ea 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -75,7 +75,6 @@ xfs_inode_alloc( return NULL; } - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); @@ -150,7 +149,6 @@ xfs_inode_free( } /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2380a4bcbece..760140d1dd66 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -257,7 +257,6 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ - atomic_t i_iocount; /* outstanding I/O count */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 32aca87bde5e..e041e917c1d9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -840,9 +840,9 @@ xfs_setattr_size( } /* - * Wait for all I/O to complete. + * Wait for all direct I/O to complete. */ - xfs_ioend_wait(ip); + inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2366c54cc4fa..54d5e102ffe1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -796,8 +796,6 @@ xfs_fs_destroy_inode( if (is_bad_inode(inode)) goto out_reclaim; - xfs_ioend_wait(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -837,7 +835,6 @@ xfs_fs_inode_init_once( inode_init_once(VFS_I(ip)); /* xfs inode */ - atomic_set(&ip->i_iocount, 0); atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); init_waitqueue_head(&ip->i_ipin_wait); @@ -914,9 +911,8 @@ xfs_fs_write_inode( * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for thus, which reduces the number - * of synchronous log foces dramatically. + * of synchronous log forces dramatically. */ - xfs_ioend_wait(ip); error = xfs_log_inode(ip); if (error) goto out; @@ -1681,7 +1677,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_ioend_init(); xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 90cc197e0433..bf2b38c21caa 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -227,21 +227,17 @@ xfs_sync_inode_data( int error = 0; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_wait; + return 0; if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { if (flags & SYNC_TRYLOCK) - goto out_wait; + return 0; xfs_ilock(ip, XFS_IOLOCK_SHARED); } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 
0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - out_wait: - if (flags & SYNC_WAIT) - xfs_ioend_wait(ip); return error; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 51fc429527bc..c2ff0fc86567 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -647,8 +647,6 @@ xfs_inactive( if (truncate) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ioend_wait(ip); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, @@ -2076,7 +2074,7 @@ xfs_free_file_space( if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); /* wait for the completion of any pending DIOs */ - xfs_ioend_wait(ip); + inode_dio_wait(VFS_I(ip)); } rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); -- cgit v1.2.3-58-ga151 From 859f57ca00805e6c482eef1a7ab073097d02c8ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Aug 2011 14:45:11 +0000 Subject: xfs: avoid synchronous transactions when deleting attr blocks Currently xfs_attr_inactive causes a synchronous transaction if we are removing a file that has any extents allocated to the attribute fork, and thus makes XFS extremely slow at removing files with out of line extended attributes. The code looks like a relic from the days before the busy extent list, but with the busy extent list we avoid reusing data and attr extents that have been freed but not committed yet, so this code is just as superfluous as the synchronous transactions for data blocks. Signed-off-by: Christoph Hellwig Reported-by: Bernd Schubert Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 12 ------------ fs/xfs/xfs_bmap.c | 10 +--------- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 8f0f65833500..3dd5c9c374cb 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Signal synchronous inactive transactions unless this is a - * synchronous mount filesystem in which case we know that we're here - * because we've been called out of xfs_inactive which means that the - * last reference is gone and the unlink transaction has already hit - * the disk so async inactive transactions are safe. - */ - if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { - if (dp->i_d.di_anextents > 0) - xfs_trans_set_sync(trans); - } - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 452a291383ab..a6075c0f8456 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3782,19 +3782,11 @@ xfs_bmap_compute_maxlevels( * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done * last due to locking considerations. We never free any extents in - * the first transaction. This is to allow the caller to make the first - * transaction a synchronous one so that the pointers to the data being - * broken in this transaction will be permanent before the data is actually - * freed. This is necessary to prevent blocks from being reallocated - * and written to before the free and reallocation are actually permanent. - * We do not just make the first transaction synchronous here, because - * there are more efficient ways to gain the same protection in some cases - * (see the file truncation code). + * the first transaction.
* * Return 1 if the given transaction was committed and a new one * started, and 0 otherwise in the committed parameter. */ -/*ARGSUSED*/ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ -- cgit v1.2.3-58-ga151 From c58cb165bd44de8aaee9755a144136ae743be116 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Aug 2011 14:42:53 +0000 Subject: xfs: avoid direct I/O write vs buffered I/O race Currently a buffered reader or writer can add pages to the pagecache while we are waiting for the iolock in xfs_file_dio_aio_write. Prevent this by re-checking mapping->nrpages after we got the iolock, and if necessary upgrade the lock to exclusive mode. To simplify this a bit, only take the ilock inside of xfs_file_aio_write_checks. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ee63c4fb3639..06fe97e56e48 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -676,6 +676,7 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); @@ -798,14 +799,24 @@ xfs_file_dio_aio_write( *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; if (mapping->nrpages) { - WARN_ON(*iolock != XFS_IOLOCK_EXCL); ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) @@ -851,7 +862,7 @@ xfs_file_buffered_aio_write( size_t count = ocount; *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) -- cgit v1.2.3-58-ga151 From 04f658ee229f60dbb9a0dc2f3d6871b12b758051 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Aug 2011 05:59:25 +0000 Subject: xfs: improve ioend error handling Return unwritten extent conversion errors to aio_complete. Skip both unwritten extent conversion and size updates if we had an I/O error or the filesystem has been shut down. Return -EIO to the aio/buffer completion handlers in case of a forced shutdown. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 46bba3e0af47..22aadf667862 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -88,8 +88,10 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { - if (ioend->io_isasync) - aio_complete(ioend->io_iocb, ioend->io_result, 0); + if (ioend->io_isasync) { + aio_complete(ioend->io_iocb, ioend->io_error ?
+ ioend->io_error : ioend->io_result, 0); + } inode_dio_done(ioend->io_inode); } @@ -141,9 +143,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - if (unlikely(ioend->io_error)) - return 0; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) return EAGAIN; @@ -189,17 +188,24 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN && - likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - + if (ioend->io_type == IO_UNWRITTEN) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); - if (error) - ioend->io_error = error; + if (error) { + ioend->io_error = -error; + goto done; + } } /* @@ -209,6 +215,7 @@ xfs_end_io( error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); +done: /* * If we didn't complete processing of the ioend, requeue it to the * tail of the workqueue for another attempt later. Otherwise destroy -- cgit v1.2.3-58-ga151 From b522950f0ab8551f2ef56c210ebd50e6c6396601 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Wed, 7 Sep 2011 19:37:54 +0000 Subject: xfs: Check the return value of xfs_buf_get() Check the return value of xfs_buf_get() and fail appropriately. Signed-off-by: Chandra Seetharaman Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 4 ++-- fs/xfs/xfs_fsops.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 3dd5c9c374cb..c7dab0c0bdda 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2109,8 +2109,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(!xfs_buf_geterror(bp)); - + if (!bp) + return ENOMEM; tmp = (valuelen < XFS_BUF_SIZE(bp)) ? 
valuelen : XFS_BUF_SIZE(bp); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index e023f940a3dd..1c6fdeb702ff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -194,6 +194,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -227,6 +231,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -253,6 +261,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -276,6 +288,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -300,6 +316,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); -- cgit v1.2.3-58-ga151 From 2a30f36d9069b0646dcdd73def5fd7ab674bffd6 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 20 Sep 2011 13:56:55 +0000 Subject: xfs: Check the return value of xfs_trans_get_buf() Check the return value of xfs_trans_get_buf() and fail appropriately. Signed-off-by: Chandra Seetharaman Signed-off-by: Alex Elder --- fs/xfs/xfs_attr_leaf.c | 2 ++ fs/xfs/xfs_btree.c | 3 ++- fs/xfs/xfs_dquot.c | 5 ++++- fs/xfs/xfs_ialloc.c | 13 ++++++++----- fs/xfs/xfs_inode.c | 9 ++++++--- fs/xfs/xfs_vnodeops.c | 9 ++++++++- 6 files changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8fad9602542b..58c3add07b65 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2948,6 +2948,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, dblkno, dblkcnt, XBF_LOCK); + if (!bp) + return ENOMEM; xfs_trans_binval(*trans, bp); /* * Roll to next transaction. 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2b9fd385e27d..28cc0199dc1f 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -970,7 +970,8 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(!xfs_buf_geterror(*bpp)); + if (!*bpp) + return ENOMEM; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3e2ccaedc51e..0c5fe66ce92b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -402,8 +402,11 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = xfs_buf_geterror(bp))) + + error = xfs_buf_geterror(bp); + if (error) goto error1; + /* * Make a chunk of dquots out of this buffer and log * the entire thing. diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 9f24ec28283b..207e0b0c0730 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -150,7 +150,7 @@ xfs_check_agi_freecount( /* * Initialise a new set of inodes. */ -STATIC void +STATIC int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, @@ -202,8 +202,8 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(!xfs_buf_geterror(fbuf)); - + if (!fbuf) + return ENOMEM; /* * Initialize all inodes in this buffer and then log them. * @@ -225,6 +225,7 @@ xfs_ialloc_inode_init( } xfs_trans_inode_alloc_buf(tp, fbuf); } + return 0; } /* @@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, - random32()); + error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, + args.len, random32()); + if (error) + return error; /* * Convert the results. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7f237ba3c292..d689253fdfda 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1644,7 +1644,7 @@ xfs_iunlink_remove( * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC void +STATIC int xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -1690,6 +1690,8 @@ xfs_ifree_cluster( mp->m_bsize * blks_per_cluster, XBF_LOCK); + if (!bp) + return ENOMEM; /* * Walk the inodes already attached to the buffer and mark them * stale. 
These will all have the flush locks held, so an @@ -1799,6 +1801,7 @@ retry: } xfs_perag_put(pag); + return 0; } /* @@ -1878,10 +1881,10 @@ xfs_ifree( dip->di_mode = 0; if (delete) { - xfs_ifree_cluster(ip, tp, first_ino); + error = xfs_ifree_cluster(ip, tp, first_ino); } - return 0; + return error; } /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c2ff0fc86567..0d1caec873a1 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -308,6 +308,10 @@ xfs_inactive_symlink_rmt( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } xfs_trans_binval(tp, bp); } /* @@ -1648,7 +1652,10 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(!xfs_buf_geterror(bp)); + if (!bp) { + error = ENOMEM; + goto error2; + } if (pathlen < byte_cnt) { byte_cnt = pathlen; } -- cgit v1.2.3-58-ga151 From eabbaf118239d0d4188298b52751040f3b4cc28f Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Thu, 8 Sep 2011 20:18:50 +0000 Subject: xfs: Fix the incorrect comment in the header of _xfs_buf_find Fix the incorrect comment in the header of the function _xfs_buf_find(). Signed-off-by: Chandra Seetharaman Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d3c2b58d7d70..e3af85095ddb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -415,10 +415,7 @@ _xfs_buf_map_pages( /* * Look up, and creates if absent, a lockable buffer for * a given range of an inode. The buffer is returned - locked. If other overlapping buffers exist, they are - released before the new buffer is created and locked, - which may imply that this call will block until those buffers - are unlocked. No I/O is implied by this call. + locked. No I/O is implied by this call. */ xfs_buf_t * _xfs_buf_find( -- cgit v1.2.3-58-ga151 From ed32201e65e15f3e6955cb84cbb544b08f81e5a5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Sat, 17 Sep 2011 13:38:38 +0000 Subject: xfs: Return -EIO when xfs_vn_getattr() failed An attribute of an inode can be fetched via xfs_vn_getattr() in XFS. Currently it returns EIO, not a negative value, when it fails. As a result, the system call returns a non-negative value even though an error occurred. The stat(2), ls and mv commands cannot handle this error and do not work correctly. This patch fixes this bug, and returns -EIO, not EIO, when an error is detected in xfs_vn_getattr().
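The bug class here is the errno sign convention: XFS historically passes positive errno values around internally, while the VFS expects negative ones from methods like ->getattr. A minimal userspace C sketch of the boundary negation follows; internal_getattr and vn_getattr are illustrative names, not the actual functions.

	#include <errno.h>
	#include <stdio.h>

	/* internal helper following the positive-errno convention */
	static int internal_getattr(int force_error)
	{
		return force_error ? EIO : 0;
	}

	/*
	 * The boundary function must hand back the negative-errno convention
	 * the caller expects, hence the negation on the error path.
	 */
	static int vn_getattr(int force_error)
	{
		int error = internal_getattr(force_error);

		return error ? -error : 0;	/* EIO becomes -EIO here */
	}

	int main(void)
	{
		printf("ok path: %d\n", vn_getattr(0));		/* prints 0 */
		printf("error path: %d\n", vn_getattr(1));	/* prints -5, i.e. -EIO */
		return 0;
	}

A positive return value at this boundary is silently treated as success data by callers, which is exactly why stat(2), ls and mv misbehaved.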
Signed-off-by: Mitsuo Hayasaka Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e041e917c1d9..e6b3e7620888 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -465,7 +465,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; -- cgit v1.2.3-58-ga151 From e7455e02e5effcdd49bb28e7dfface2d3473de52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:41 +0000 Subject: xfs: remove the first extent special case in xfs_bmap_add_extent Both xfs_bmap_add_extent_hole_delay and xfs_bmap_add_extent_hole_real already contain code to handle the case where there is no extent to merge with, which is effectively the same as the code duplicated here. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a6075c0f8456..927e5454a43c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -465,27 +465,10 @@ xfs_bmap_add_extent( ASSERT(*idx >= 0); ASSERT(*idx <= nextents); - /* - * This is the first extent added to a new/empty file. - * Special case this one, so other routines get to assume there are - * already extents in the list. - */ - if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - - ASSERT(cur == NULL); - - if (!isnullstartblock(new->br_startblock)) { - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - } else - logflags = 0; - } /* * Any kind of new delayed allocation goes here. */ - else if (isnullstartblock(new->br_startblock)) { + if (isnullstartblock(new->br_startblock)) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); -- cgit v1.2.3-58-ga151 From b9b984d7846e37c57e5b3f8cd883ad45e8ebc2cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:42 +0000 Subject: xfs: remove impossible to read code in xfs_bmap_add_extent_delay_real We already have the worst-case blocks reserved, so xfs_icsb_modify_counters won't fail in xfs_bmap_add_extent_delay_real. In fact we've had an assert to catch this case since day one, and it has never triggered. So remove the code to try smaller reservations, and just return the error for that case in addition to keeping the assert. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 927e5454a43c..ebcd38b14830 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1045,34 +1045,15 @@ xfs_bmap_add_extent_delay_real( temp2 = xfs_bmap_worst_indlen(ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - if (diff > 0 && - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { - /* - * Ick gross gag me with a spoon. - */ - ASSERT(0); /* want to see if this ever happens!
*/ - while (diff > 0) { - if (temp) { - temp--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - if (temp2) { - temp2--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - } + if (diff > 0) { + error = xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0); + ASSERT(!error); + if (error) + goto done; } + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); -- cgit v1.2.3-58-ga151 From ecee76ba9d91fdcbdff933ca1bd41465ca4c4fdb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:43 +0000 Subject: xfs: remove the nextents variable in xfs_bmapi Instead of using a local variable that needs to be updated when we modify the extent map, just check ifp->if_bytes directly where we use it. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index ebcd38b14830..05d38adbb85e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4294,7 +4294,6 @@ xfs_bmapi( xfs_mount_t *mp; /* xfs mount structure */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ - xfs_extnum_t nextents; /* number of extents in file */ xfs_fileoff_t obno; /* old block number (offset) */ xfs_bmbt_irec_t prev; /* previous file extent record */ int tmp_logflags; /* temp flags holder */ @@ -4380,7 +4379,6 @@ xfs_bmapi( goto error0; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); n = 0; end = bno + len; obno = bno; @@ -4622,7 +4620,6 @@ xfs_bmapi( if (error) goto error0; ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); ASSERT(got.br_startoff <= aoff); ASSERT(got.br_startoff + got.br_blockcount >= @@ -4723,7 +4720,6 @@ xfs_bmapi( if (error) goto error0; ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); /* * We may have combined previously unwritten @@ -4781,7 +4777,6 @@ xfs_bmapi( * Else go on to the next record. */ prev = got; - if (++lastx < nextents) { + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { ep = xfs_iext_get_ext(ifp, lastx); xfs_bmbt_get_all(ep, &got); } else { -- cgit v1.2.3-58-ga151 From aef9a89586fc8475bf0333b8736d5aa8aa6f4897 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:44 +0000 Subject: xfs: factor extent map manipulations out of xfs_bmapi To further improve the readability of xfs_bmapi(), factor the pure extent map manipulations out into separate functions. This removes large blocks of logic from the xfs_bmapi() code loop and makes it easier to see the operational logic flow for xfs_bmapi().
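To make the new trim step concrete, the sketch below models the core of the factored-out clamping arithmetic in plain user-space C. Unsigned integers stand in for xfs_fileoff_t/xfs_filblks_t, and the delalloc and XFS_BMAPI_ENTIRE handling is omitted, so read it as an assumption-laden illustration rather than the kernel routine.

#include <stdio.h>

/* Clamp a located extent [got_off, got_off + got_len) to the requested
 * range [bno, end). Assumes they overlap and got_off <= bno, which the
 * real code guarantees at this point. */
static void trim_map(unsigned long long got_off, unsigned long long got_len,
		     unsigned long long bno, unsigned long long end,
		     unsigned long long *map_off, unsigned long long *map_len)
{
	unsigned long long left_in_extent = got_off + got_len - bno;
	unsigned long long left_in_request = end - bno;

	*map_off = bno;
	*map_len = left_in_extent < left_in_request ?
			left_in_extent : left_in_request;
}

int main(void)
{
	unsigned long long off, len;

	/* Extent covers blocks [100, 150); the caller asked for [120, 200). */
	trim_map(100, 50, 120, 200, &off, &len);
	printf("mapped [%llu, %llu)\n", off, off + len); /* [120, 150) */
	return 0;
}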
Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 181 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 107 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 05d38adbb85e..59f63949d3bc 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4248,6 +4248,107 @@ xfs_bmap_validate_ret( #endif /* DEBUG */ +/* + * Trim the returned map to the required bounds + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. 
@@ -4650,44 +4751,9 @@ xfs_bmapi( n++; continue; } - /* - * Then deal with the allocated space we found. - */ - ASSERT(ep != NULL); - if (!(flags & XFS_BMAPI_ENTIRE) && - (got.br_startoff + got.br_blockcount > obno)) { - if (obno > bno) - bno = obno; - ASSERT((bno >= obno) || (n == 0)); - ASSERT(bno < end); - mval->br_startoff = bno; - if (isnullstartblock(got.br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } else - mval->br_startblock = - got.br_startblock + - (bno - got.br_startoff); - /* - * Return the minimum of what we got and what we - * asked for for the length. We can use the len - * variable here because it is modified below - * and we could have been there before coming - * here if the first part of the allocation - * didn't overlap what was asked for. - */ - mval->br_blockcount = - XFS_FILBLKS_MIN(end - bno, got.br_blockcount - - (bno - got.br_startoff)); - mval->br_state = got.br_state; - ASSERT(mval->br_blockcount <= len); - } else { - *mval = got; - if (isnullstartblock(mval->br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } - } + + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); /* * Check if writing previously allocated but @@ -4730,42 +4796,9 @@ xfs_bmapi( continue; } - ASSERT((flags & XFS_BMAPI_ENTIRE) || - ((mval->br_startoff + mval->br_blockcount) <= end)); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - (mval->br_blockcount <= len) || - (mval->br_startoff < obno)); - bno = mval->br_startoff + mval->br_blockcount; - len = end - bno; - if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { - ASSERT(mval->br_startblock == mval[-1].br_startblock); - ASSERT(mval->br_blockcount > mval[-1].br_blockcount); - ASSERT(mval->br_state == mval[-1].br_state); - mval[-1].br_blockcount = mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != HOLESTARTBLOCK && - mval->br_startblock == - mval[-1].br_startblock + mval[-1].br_blockcount && - ((flags & XFS_BMAPI_IGSTATE) || - mval[-1].br_state == mval->br_state)) { - ASSERT(mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount); - mval[-1].br_blockcount += mval->br_blockcount; - } else if (n > 0 && - mval->br_startblock == DELAYSTARTBLOCK && - mval[-1].br_startblock == DELAYSTARTBLOCK && - mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount) { - mval[-1].br_blockcount += mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (!((n == 0) && - ((mval->br_startoff + mval->br_blockcount) <= - obno))) { - mval++; - n++; - } + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + /* * If we're done, stop now. Stop when we've allocated * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise -- cgit v1.2.3-58-ga151 From 5c8ed2021ff291f5e399a9b43c4f699b2fc58fbb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:45 +0000 Subject: xfs: introduce xfs_bmapi_read() xfs_bmapi() currently handles both extent map reading and allocation. As a result, the code is littered with "if (wr)" branches to conditionally do allocation operations if required. This makes the code much harder to follow and causes significant indent issues with the code. 
Given that read mapping is much simpler than allocation, we can split out read mapping from xfs_bmapi() and reuse the logic that we have already factored out to do all the hard work of handling the extent map manipulations. This results in a much simpler function for the common extent read operations, and will allow the allocation code to be simplified in another commit. Once xfs_bmapi_read() is implemented, convert all the callers of xfs_bmapi() that are only reading extents to use the new function. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 8 ++-- fs/xfs/xfs_attr.c | 30 ++++++-------- fs/xfs/xfs_attr_leaf.c | 5 +-- fs/xfs/xfs_bmap.c | 104 +++++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_bmap.h | 4 ++ fs/xfs/xfs_da_btree.c | 9 ++--- fs/xfs/xfs_dir2_leaf.c | 6 +-- fs/xfs/xfs_dquot.c | 5 +-- fs/xfs/xfs_file.c | 10 ++--- fs/xfs/xfs_inode.c | 12 +++--- fs/xfs/xfs_iomap.c | 4 +- fs/xfs/xfs_qm.c | 7 +--- fs/xfs/xfs_vnodeops.c | 24 ++++++------ 13 files changed, 151 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 22aadf667862..11b2aad982d4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -315,8 +315,8 @@ xfs_map_blocks( count = mp->m_maxioffset - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - bmapi_flags, NULL, 0, imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -1138,8 +1138,8 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index c7dab0c0bdda..41ef02b7185a 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1963,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) lblkno = args->rmtblkno; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL); + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap >= 1); @@ -2092,14 +2091,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - NULL); - if (error) { + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2155,16 +2151,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value.
*/ - xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - args->flist); - if (error) { + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 58c3add07b65..d4906e7c9787 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL); + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) { return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 59f63949d3bc..fc5acf0b921e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4349,6 +4349,98 @@ xfs_bmapi_update_map( *map = mval; } +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. 
*/ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. @@ -5490,10 +5582,9 @@ xfs_getbmap( do { nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - bmapi_flags, NULL, 0, map, &nmap, - NULL); + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -6084,9 +6175,8 @@ xfs_bmap_punch_delalloc_range( * trying to remove a real extent (which requires a * transaction) or a hole, which is probably a bad idea... */ - error = xfs_bmapi(NULL, ip, start_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); if (error) { /* something screwed, just bail */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c62234bde053..16257c6eba71 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -294,6 +294,10 @@ xfs_bmapi( int *nmap, /* i/o: mval size/count */ xfs_bmap_free_t *flist); /* i/o: list extents to free */ +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); + /* * Map file blocks to filesystem blocks, simple version. * One block only, read-only. diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index ee9d5427fcd4..44dfa39099ef 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1995,11 +1995,10 @@ xfs_da_do_buf( } else { mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); nmap = nfsb; - if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, - nfsb, - XFS_BMAPI_METADATA | - xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL))) + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, + mapp, &nmap, + xfs_bmapi_aflag(whichfork)); + if (error) goto exit0; } } else { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ca2386d82cdf..66e108f561a3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents( * we already have in the table. */ nmap = map_size - map_valid; - error = xfs_bmapi(NULL, dp, - map_off, + error = xfs_bmapi_read(dp, map_off, xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, - XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL); + &map[map_valid], &nmap, 0); /* * Don't know if we should ignore this or * try to return an error. 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0c5fe66ce92b..c377961657ee 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -488,9 +488,8 @@ xfs_qm_dqtobp( /* * Find the block map; no allocations yet */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL); + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 06fe97e56e48..558543c146b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -509,11 +509,9 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL); - if (error) { + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + if (error) return error; - } ASSERT(nimaps > 0); /* * If the block underlying isize is just a hole, then there @@ -604,8 +602,8 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d689253fdfda..f690d3a10b34 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1187,6 +1187,7 @@ xfs_isize_check( xfs_fileoff_t map_first; int nimaps; xfs_bmbt_irec_t imaps[2]; + int error; if (!S_ISREG(ip->i_d.di_mode)) return; @@ -1203,13 +1204,12 @@ xfs_isize_check( * The filesystem could be shutting down, so bmapi may return * an error. */ - if (xfs_bmapi(NULL, ip, map_first, + error = xfs_bmapi_read(ip, map_first, (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - - map_first), - XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL)) - return; + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 091d82b94c4d..544f053860f1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -300,8 +300,8 @@ xfs_iomap_eof_want_preallocate( while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); if (error) return error; for (n = 0; n < imaps; n++) { diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f51bef885e6d..ddaf97a57ec6 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1347,11 +1347,8 @@ xfs_qm_dqiterate( * the inode is never added to the transaction. 
*/ xfs_ilock(qip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, qip, lblkno, - maxlblkcnt - lblkno, - XFS_BMAPI_METADATA, - NULL, - 0, map, &nmaps, NULL); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 0d1caec873a1..63874a87b378 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -72,8 +72,8 @@ xfs_readlink_bmap( xfs_buf_t *bp; int error = 0; - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL); + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, + 0); if (error) goto out; @@ -178,8 +178,7 @@ xfs_free_eofblocks( nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -297,9 +296,9 @@ xfs_inactive_symlink_rmt( done = 0; xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); - if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), - XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list))) + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), + mval, &nmaps, 0); + if (error) goto error0; /* * Invalidate the block(s). @@ -1981,8 +1980,7 @@ xfs_zero_remaining_bytes( for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; - error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -2101,8 +2099,8 @@ xfs_free_file_space( */ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; - error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2116,8 +2114,8 @@ xfs_free_file_space( startoffset_fsb += mp->m_sb.sb_rextsize - mod; } nimap = 1; - error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); -- cgit v1.2.3-58-ga151 From 5b777ad517ee75d3bb8d67c142d808822e46601b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:46 +0000 Subject: xfs: remove xfs_bmapi_single() Now we have xfs_bmapi_read, there is no need for xfs_bmapi_single(). Change the remaining caller over and kill the function. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 52 --------------------------------------------------- fs/xfs/xfs_bmap.h | 15 --------------- fs/xfs/xfs_da_btree.c | 32 ++++++++----------------------- fs/xfs/xfs_rtalloc.c | 32 +++++++++++-------------------- 4 files changed, 19 insertions(+), 112 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index fc5acf0b921e..fd06bf17d09c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4968,58 +4968,6 @@ error0: return error; } -/* - * Map file blocks to filesystem blocks, simple version. - * One block (extent) only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. 
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno) /* starting file offs. mapped */ -{ - int eof; /* we've hit the end of extents */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - XFS_STATS_INC(xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof || got.br_startoff > bno) { - *fsb = NULLFSBLOCK; - return 0; - } - ASSERT(!isnullstartblock(got.br_startblock)); - ASSERT(bno < got.br_startoff + got.br_blockcount); - *fsb = got.br_startblock + (bno - got.br_startoff); - return 0; -} - /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 16257c6eba71..01693390df21 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -298,21 +298,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); -/* - * Map file blocks to filesystem blocks, simple version. - * One block only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno); /* starting file offs. mapped */ - /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 44dfa39099ef..70a5f580e5ed 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1975,32 +1975,16 @@ xfs_da_do_buf( /* * Optimize the one-block case. 
*/ - if (nfsb == 1) { - xfs_fsblock_t fsb; - - if ((error = - xfs_bmapi_single(trans, dp, whichfork, &fsb, - (xfs_fileoff_t)bno))) { - return error; - } + if (nfsb == 1) mapp = ↦ - if (fsb == NULLFSBLOCK) { - nmap = 0; - } else { - map.br_startblock = fsb; - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = 1; - nmap = 1; - } - } else { + else mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); - nmap = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, - mapp, &nmap, - xfs_bmapi_aflag(whichfork)); - if (error) - goto exit0; - } + + nmap = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, + &nmap, xfs_bmapi_aflag(whichfork)); + if (error) + goto exit0; } else { map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); map.br_startoff = (xfs_fileoff_t)bno; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 35561a511b57..e5f40c6460b2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -856,33 +856,23 @@ xfs_rtbuf_get( xfs_buf_t **bpp) /* output: buffer for the block */ { xfs_buf_t *bp; /* block buffer, result */ - xfs_daddr_t d; /* disk addr of block */ - int error; /* error value */ - xfs_fsblock_t fsb; /* fs block number for block */ xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap; + int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; - /* - * Map from the file offset (block) and inode number to the - * file system block. - */ - error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); - if (error) { + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fsb != NULLFSBLOCK); - /* - * Convert to disk address for buffer cache. - */ - d = XFS_FSB_TO_DADDR(mp, fsb); - /* - * Read the buffer. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); - if (error) { + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; -- cgit v1.2.3-58-ga151 From b64dfe4e180ab5047c59bcbe379538eb23be4d8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:47 +0000 Subject: xfs: factor delalloc reservations out of xfs_bmapi Move the reservation of delayed allocations, and addition of delalloc regions to the extent trees into a new helper function. For now this adds some twisted goto logic to xfs_bmapi, but that will be cleaned up in the following patches. 
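The helper introduced here follows the familiar reserve-then-unwind shape: take the quota, block, and worst-case indirect-block reservations in order, and release whatever was taken, in reverse order, if a later step fails. The compilable sketch below shows only that idiom; the reserve/unreserve functions are hypothetical stand-ins, not the XFS quota or superblock-counter APIs.

#include <stdio.h>

/* Hypothetical resources; the last reservation fails on purpose so
 * the unwind path runs. */
static int reserve_quota(void)           { return 0; }
static int reserve_data_blocks(void)     { return 0; }
static int reserve_indirect_blocks(void) { return -1; }
static void unreserve_data_blocks(void)  { puts("data blocks released"); }
static void unreserve_quota(void)        { puts("quota released"); }

static int reserve_delalloc_sketch(void)
{
	int error;

	error = reserve_quota();
	if (error)
		return error;

	error = reserve_data_blocks();
	if (error)
		goto out_unreserve_quota;

	error = reserve_indirect_blocks();
	if (error)
		goto out_unreserve_blocks;

	return 0;	/* success: all three reservations are held */

out_unreserve_blocks:
	unreserve_data_blocks();
out_unreserve_quota:
	unreserve_quota();
	return error;
}

int main(void)
{
	printf("result: %d\n", reserve_delalloc_sketch());
	return 0;
}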
Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 202 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 118 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index fd06bf17d09c..cd01783a89d7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4441,6 +4441,120 @@ xfs_bmapi_read( return 0; } +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + xfs_fsblock_t firstblock = NULLFSBLOCK; + struct xfs_btree_cur *cur = NULL; + int tmp_logflags = 0; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * Make sure we don't exceed a single extent length when we + * align the extent by reducing length we are going to + * allocate by the maximum amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + error = xfs_bmap_add_extent(NULL, ip, lastx, &cur, got, &firstblock, + NULL, &tmp_logflags, XFS_DATA_FORK); + ASSERT(!error); + ASSERT(!tmp_logflags); + ASSERT(!cur); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent might + * have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + else + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + /* * Map file blocks to filesystem blocks. 
* File range is given by the bno/len pair. @@ -4479,7 +4593,6 @@ xfs_bmapi( int error; /* error return */ xfs_bmbt_irec_t got; /* current file extent record */ xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extlen_t indlen; /* indirect blocks length */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks left after allocation */ @@ -4615,43 +4728,8 @@ xfs_bmapi( } minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; if (flags & XFS_BMAPI_DELAY) { - xfs_extlen_t extsz; - - /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); - if (extsz) { - /* - * make sure we don't exceed a single - * extent length when we align the - * extent by reducing length we are - * going to allocate by the maximum - * amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, - MAXEXTLEN - (2 * extsz - 1)); - error = xfs_bmap_extsize_align(mp, - &got, &prev, extsz, - rt, eof, - flags&XFS_BMAPI_DELAY, - flags&XFS_BMAPI_CONVERT, - &aoff, &alen); - ASSERT(!error); - } - - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - - /* - * Make a transaction-less quota reservation for - * delayed allocation blocks. This number gets - * adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_trans_reserve_quota_nblks( - NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); if (error) { if (n == 0) { *nmap = 0; @@ -4661,51 +4739,7 @@ xfs_bmapi( break; } - /* - * Split changing sb for alen and indlen since - * they could be coming from different places. - */ - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - if (rt) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); - } else { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); - } - if (!error) { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); - if (error && rt) - xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); - else if (error) - xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); - } - - if (error) { - if (XFS_IS_QUOTA_ON(mp)) - /* unreserve the blocks now */ - (void) - xfs_trans_unreserve_quota_nblks( - NULL, ip, - (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - break; - } - - ip->i_delayed_blks += alen; - abno = nullstartblock(indlen); + goto trim_extent; } else { /* * If first time, allocate and fill in @@ -4843,7 +4877,7 @@ xfs_bmapi( n++; continue; } - +trim_extent: /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); -- cgit v1.2.3-58-ga151 From 4403280aa5c00c6074f2dc23e1cfc11f2bfb0032 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:48 +0000 Subject: xfs: introduce xfs_bmapi_delay() Delalloc reservations are much simpler than allocations, so give them a separate bmapi-level interface. Using the previously added xfs_bmapi_reserve_delalloc we get a function that is only minimally more complicated than xfs_bmapi_read, which is far from the complexity in xfs_bmapi. Also remove the XFS_BMAPI_DELAY code after switching over the only user to xfs_bmapi_delay. 
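In outline, xfs_bmapi_read() and the new xfs_bmapi_delay() now share one walk: locate the extent covering the current offset, handle a hole in a variant-specific way, trim the mapping to the request, and advance. The toy model below captures only that control flow over a fixed extent list; the types, names, and the pretend delalloc reservation are assumptions for illustration, not the XFS implementation.

#include <stdio.h>

/* Two mapped extents with a hole between them, in file-block units. */
struct irec { unsigned long long off, len; };
static const struct irec extents[] = { { 0, 4 }, { 10, 6 } };
#define NEXTENTS (sizeof(extents) / sizeof(extents[0]))

static void map_range(unsigned long long bno, unsigned long long end,
		      int reserve_holes)
{
	unsigned int i = 0;

	while (bno < end) {
		/* Skip extents that end at or before the current offset. */
		while (i < NEXTENTS && extents[i].off + extents[i].len <= bno)
			i++;

		if (i == NEXTENTS || extents[i].off > bno) {
			/* In a hole: the read variant reports it, the delay
			 * variant would reserve delalloc blocks over it. */
			unsigned long long hend = end;

			if (i < NEXTENTS && extents[i].off < end)
				hend = extents[i].off;
			printf("  [%llu,%llu) %s\n", bno, hend,
			       reserve_holes ? "delalloc reserved" : "hole");
			bno = hend;
		} else {
			/* Found a covering extent: trim it to the request. */
			unsigned long long eend =
				extents[i].off + extents[i].len;
			unsigned long long mend = eend < end ? eend : end;

			printf("  [%llu,%llu) mapped\n", bno, mend);
			bno = mend;
		}
	}
}

int main(void)
{
	puts("read-style walk:");
	map_range(2, 14, 0);
	puts("delay-style walk:");
	map_range(2, 14, 1);
	return 0;
}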
Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 115 +++++++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_bmap.h | 5 ++- fs/xfs/xfs_iomap.c | 9 +---- 3 files changed, 94 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index cd01783a89d7..6f3276974555 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4238,7 +4238,7 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) + if (flags & XFS_BMAPI_WRITE) ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || @@ -4555,6 +4555,90 @@ out_unreserve_quota: return error; } +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. 
@@ -4663,7 +4747,6 @@ xfs_bmapi( */ if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ wr = 0; /* no allocations are allowed */ - ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); logflags = 0; nallocs = 0; cur = NULL; @@ -4698,8 +4781,7 @@ xfs_bmapi( if (eof && !wr) got.br_startoff = end; inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - isnullstartblock(got.br_startblock); + wasdelay = wr && !inhole && isnullstartblock(got.br_startblock); /* * First, deal with the hole before the allocated space * that we found, if any. @@ -4727,20 +4809,7 @@ xfs_bmapi( aoff = bno; } minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - if (flags & XFS_BMAPI_DELAY) { - error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, - &prev, &lastx, eof); - if (error) { - if (n == 0) { - *nmap = 0; - ASSERT(cur == NULL); - return error; - } - break; - } - - goto trim_extent; - } else { + { /* * If first time, allocate and fill in * once-only bma fields. @@ -4851,14 +4920,8 @@ xfs_bmapi( ASSERT(got.br_startoff <= aoff); ASSERT(got.br_startoff + got.br_blockcount >= aoff + alen); -#ifdef DEBUG - if (flags & XFS_BMAPI_DELAY) { - ASSERT(isnullstartblock(got.br_startblock)); - ASSERT(startblockval(got.br_startblock) > 0); - } ASSERT(got.br_state == XFS_EXT_NORM || got.br_state == XFS_EXT_UNWRITTEN); -#endif /* * Fall down into the found allocated space case. */ @@ -4877,7 +4940,7 @@ xfs_bmapi( n++; continue; } -trim_extent: + /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); @@ -4887,7 +4950,7 @@ trim_extent: */ if (wr && ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || + ((flags & XFS_BMAPI_PREALLOC) == 0)) || (mval->br_state == XFS_EXT_NORM && ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 01693390df21..c70c19c7f1fa 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -65,7 +65,6 @@ typedef struct xfs_bmap_free * Flags for xfs_bmapi */ #define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ @@ -82,7 +81,6 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_WRITE, "WRITE" }, \ - { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -297,6 +295,9 @@ xfs_bmapi( int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); /* * Unmap (remove) blocks from a file. 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 544f053860f1..681ba34c9233 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -381,7 +381,6 @@ xfs_iomap_write_delay( xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; xfs_fileoff_t ioalign; - xfs_fsblock_t firstblock; xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; @@ -425,12 +424,8 @@ retry: } nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL); + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); switch (error) { case 0: case ENOSPC: -- cgit v1.2.3-58-ga151 From 1fd044d9c6735e669f0db025f18023e56a608130 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:49 +0000 Subject: xfs: do not use xfs_bmap_add_extent for adding delalloc extents We can just call xfs_bmap_add_extent_hole_delay directly to add delayed allocation regions to the extent tree, instead of going through all the complexities of xfs_bmap_add_extent that aren't needed for this simple case. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 52 +++++++++------------------------------------------- 1 file changed, 9 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6f3276974555..c9492e23447c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -100,17 +100,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp); /* inode logging flags */ -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - /* * Called by xfs_bmap_add_extent to handle cases converting a hole * to a real allocation. @@ -431,8 +420,7 @@ xfs_bmap_add_attrfork_local( } /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). + * Update file extent records and the btree after allocating space. */ STATIC int /* error */ xfs_bmap_add_extent( @@ -464,21 +452,12 @@ xfs_bmap_add_extent( ASSERT(*idx >= 0); ASSERT(*idx <= nextents); + ASSERT(!isnullstartblock(new->br_startblock)); - /* - * Any kind of new delayed allocation goes here. - */ - if (isnullstartblock(new->br_startblock)) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); - } /* * Real allocation off the end of the file. */ - else if (*idx == nextents) { + if (*idx == nextents) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); @@ -1581,16 +1560,13 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation.
*/ -/*ARGSUSED*/ -STATIC int /* error */ +STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -1725,8 +1701,6 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - *logflagsp = 0; - return 0; } /* @@ -4455,9 +4429,6 @@ xfs_bmapi_reserve_delalloc( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_extlen_t alen; xfs_extlen_t indlen; - xfs_fsblock_t firstblock = NULLFSBLOCK; - struct xfs_btree_cur *cur = NULL; - int tmp_logflags = 0; char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; @@ -4524,16 +4495,11 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - - error = xfs_bmap_add_extent(NULL, ip, lastx, &cur, got, &firstblock, - NULL, &tmp_logflags, XFS_DATA_FORK); - ASSERT(!error); - ASSERT(!tmp_logflags); - ASSERT(!cur); + xfs_bmap_add_extent_hole_delay(ip, lastx, got); /* - * Update our extent pointer, given that xfs_bmap_add_extent might - * have merged it into one of the neighbouring ones. + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); -- cgit v1.2.3-58-ga151 From 7e47a4efde33aa3f0cb901e086a75751c2269f04 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:50 +0000 Subject: xfs: factor extent allocation out of xfs_bmapi To further improve the readability of xfs_bmapi(), factor the extent allocation out into a separate function. This removes a large block of logic from the xfs_bmapi() code loop and makes it easier to see the operational logic flow for xfs_bmapi(). Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 302 +++++++++++++++++++++++++++++------------------------- 1 file changed, 162 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c9492e23447c..311cbc1d64c5 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4605,6 +4605,147 @@ xfs_bmapi_delay( } +STATIC int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma, + xfs_extnum_t *lastx, + struct xfs_btree_cur **cur, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int flags, + int *nallocs, + int *logflags) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + xfs_fsblock_t abno; + xfs_extlen_t alen; + xfs_fileoff_t aoff; + int error; + int rt; + + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. 
+ */ + if (bma->wasdel) { + alen = (xfs_extlen_t)bma->gotp->br_blockcount; + aoff = bma->gotp->br_startoff; + if (*lastx != NULLEXTNUM && *lastx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), + bma->prevp); + } + } else { + alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->alen, MAXEXTLEN); + if (!bma->eof) + alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, + bma->gotp->br_startoff - bma->off); + aoff = bma->off; + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma->userdata = (aoff == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + /* + * Fill in changeable bma fields. + */ + bma->alen = alen; + bma->off = aoff; + bma->firstblock = *firstblock; + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; + bma->low = flist->xbf_low; + bma->aeof = 0; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && alen >= mp->m_dalign && + !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma->ip, aoff, whichfork, &bma->aeof); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + /* + * Copy out result fields. + */ + abno = bma->rval; + flist->xbf_low = bma->low; + alen = bma->alen; + aoff = bma->off; + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, bma->firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, bma->firstblock))); + *firstblock = bma->firstblock; + if (*cur) + (*cur)->bc_private.b.firstblock = *firstblock; + if (abno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { + (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + (*cur)->bc_private.b.firstblock = *firstblock; + (*cur)->bc_private.b.flist = flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + (*nallocs)++; + + if (*cur) + (*cur)->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->gotp->br_startoff = aoff; + bma->gotp->br_startblock = abno; + bma->gotp->br_blockcount = alen; + bma->gotp->br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->gotp->br_state = XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, bma->gotp, + firstblock, flist, logflags, whichfork); + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent might + * have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + + ASSERT(bma->gotp->br_startoff <= aoff); + ASSERT(bma->gotp->br_startoff + bma->gotp->br_blockcount >= + aoff + alen); + ASSERT(bma->gotp->br_state == XFS_EXT_NORM || + bma->gotp->br_state == XFS_EXT_UNWRITTEN); + return 0; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. 
@@ -4632,9 +4773,6 @@ xfs_bmapi( int *nmap, /* i/o: mval size/count */ xfs_bmap_free_t *flist) /* i/o: list extents to free */ { - xfs_fsblock_t abno; /* allocated block number */ - xfs_extlen_t alen; /* allocated extent length */ - xfs_fileoff_t aoff; /* allocated file offset */ xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ @@ -4646,7 +4784,6 @@ xfs_bmapi( xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_extlen_t minlen; /* min allocation size */ xfs_mount_t *mp; /* xfs mount structure */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ @@ -4737,7 +4874,13 @@ xfs_bmapi( n = 0; end = bno + len; obno = bno; - bma.ip = NULL; + + bma.tp = tp; + bma.ip = ip; + bma.prevp = &prev; + bma.gotp = &got; + bma.total = total; + bma.userdata = 0; while (bno < end && n < *nmap) { /* @@ -4753,144 +4896,23 @@ xfs_bmapi( * that we found, if any. */ if (wr && (inhole || wasdelay)) { - /* - * For the wasdelay case, we could also just - * allocate the stuff asked for in this bmap call - * but that wouldn't be as good. - */ - if (wasdelay) { - alen = (xfs_extlen_t)got.br_blockcount; - aoff = got.br_startoff; - if (lastx != NULLEXTNUM && lastx) { - ep = xfs_iext_get_ext(ifp, lastx - 1); - xfs_bmbt_get_all(ep, &prev); - } - } else { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, MAXEXTLEN); - if (!eof) - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(alen, - got.br_startoff - bno); - aoff = bno; - } - minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - { - /* - * If first time, allocate and fill in - * once-only bma fields. - */ - if (bma.ip == NULL) { - bma.tp = tp; - bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; - bma.total = total; - bma.userdata = 0; - } - /* Indicate if this is the first user data - * in the file, or just any user data. - */ - if (!(flags & XFS_BMAPI_METADATA)) { - bma.userdata = (aoff == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : - XFS_ALLOC_USERDATA; - } - /* - * Fill in changeable bma fields. - */ - bma.eof = eof; - bma.firstblock = *firstblock; - bma.alen = alen; - bma.off = aoff; - bma.conv = !!(flags & XFS_BMAPI_CONVERT); - bma.wasdel = wasdelay; - bma.minlen = minlen; - bma.low = flist->xbf_low; - bma.minleft = minleft; - /* - * Only want to do the alignment at the - * eof if it is userdata and allocation length - * is larger than a stripe unit. - */ - if (mp->m_dalign && alen >= mp->m_dalign && - (!(flags & XFS_BMAPI_METADATA)) && - (whichfork == XFS_DATA_FORK)) { - if ((error = xfs_bmap_isaeof(ip, aoff, - whichfork, &bma.aeof))) - goto error0; - } else - bma.aeof = 0; - /* - * Call allocator. - */ - if ((error = xfs_bmap_alloc(&bma))) - goto error0; - /* - * Copy out result fields. 
- */ - abno = bma.rval; - if ((flist->xbf_low = bma.low)) - minleft = 0; - alen = bma.alen; - aoff = bma.off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma.firstblock))); - *firstblock = bma.firstblock; - if (cur) - cur->bc_private.b.firstblock = - *firstblock; - if (abno == NULLFSBLOCK) - break; - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, tp, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - /* - * Bump the number of extents we've allocated - * in this call. - */ - nallocs++; - } - if (cur) - cur->bc_private.b.flags = - wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; - got.br_startoff = aoff; - got.br_startblock = abno; - got.br_blockcount = alen; - got.br_state = XFS_EXT_NORM; /* assume normal */ - /* - * Determine state of extent, and the filesystem. - * A wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - */ - if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { - if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) - got.br_state = XFS_EXT_UNWRITTEN; - } - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, - firstblock, flist, &tmp_logflags, - whichfork); + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.alen = len; + bma.off = bno; + bma.minleft = minleft; + + error = xfs_bmapi_allocate(&bma, &lastx, &cur, + firstblock, flist, flags, &nallocs, + &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= aoff); - ASSERT(got.br_startoff + got.br_blockcount >= - aoff + alen); - ASSERT(got.br_state == XFS_EXT_NORM || - got.br_state == XFS_EXT_UNWRITTEN); - /* - * Fall down into the found allocated space case. - */ + if (flist && flist->xbf_low) + minleft = 0; + if (bma.rval == NULLFSBLOCK) + break; } else if (inhole) { /* * Reading in a hole. -- cgit v1.2.3-58-ga151 From b447fe5a05cbd01c4bf7fe2fa41cb9e99ce7e58e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:51 +0000 Subject: xfs: factor unwritten extent map manipulations out of xfs_bmapi To further improve the readability of xfs_bmapi(), factor the unwritten extent conversion out into a separate function. This removes a large block of logic from the xfs_bmapi() code loop and makes it easier to see the operational logic flow for xfs_bmapi(). Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 107 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 311cbc1d64c5..c2e49fd18bfd 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4089,7 +4089,6 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; xfs_extnum_t start; - num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); @@ -4746,6 +4745,69 @@ xfs_bmapi_allocate( return 0; } +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + xfs_extnum_t *lastx, + struct xfs_btree_cur **cur, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int flags, + int *logflags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int error; + + *logflags = 0; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { + *cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + (*cur)->bc_private.b.firstblock = *firstblock; + (*cur)->bc_private.b.flist = flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, mval, + firstblock, flist, logflags, whichfork); + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent might + * have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return EAGAIN; + return 0; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. @@ -4932,45 +4994,16 @@ xfs_bmapi( /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); - /* - * Check if writing previously allocated but - * unwritten extents. - */ - if (wr && - ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & XFS_BMAPI_PREALLOC) == 0)) || - (mval->br_state == XFS_EXT_NORM && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == - (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { - /* - * Modify (by adding) the state flag, if writing. - */ - ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, - tp, ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) - ? XFS_EXT_NORM - : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, - firstblock, flist, &tmp_logflags, - whichfork); + /* Execute unwritten extent conversion if necessary */ + if (wr) { + error = xfs_bmapi_convert_unwritten(&bma, mval, len, + &lastx, &cur, firstblock, flist, flags, + &tmp_logflags); logflags |= tmp_logflags; + if (error == EAGAIN) + continue; if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - /* - * We may have combined previously unwritten - * space with written space, so generate - * another request. - */ - if (mval->br_blockcount < len) - continue; } /* update the extent map to return */ -- cgit v1.2.3-58-ga151 From c0dc7828af6952643219292be29e482ef74cb261 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:52 +0000 Subject: xfs: rename xfs_bmapi to xfs_bmapi_write Now that all the read-only users of xfs_bmapi have been converted to use xfs_bmapi_read(), we can remove all the read-only handling cases from xfs_bmapi(). Once this is done, rename xfs_bmapi to xfs_bmapi_write to reflect the fact it is for allocation only. This enables us to kill the XFS_BMAPI_WRITE flag as well. 
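The call-site conversion is mechanical. Schematically, with placeholder arguments rather than any one real caller:

	/* before: read vs. write behaviour selected by a mode flag */
	error = xfs_bmapi(tp, ip, bno, len,
			  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
			  &firstblock, total, &map, &nmap, &flist);

	/* after: the function name carries the intent, so the flag goes away */
	error = xfs_bmapi_write(tp, ip, bno, len,
				XFS_BMAPI_METADATA,
				&firstblock, total, &map, &nmap, &flist);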
Also clean up xfs_bmapi_write to the style used in the newly added xfs_bmapi_read/delay functions. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 5 +- fs/xfs/xfs_bmap.c | 225 +++++++++++++++++++++----------------------------- fs/xfs/xfs_bmap.h | 50 +++-------- fs/xfs/xfs_da_btree.c | 10 +-- fs/xfs/xfs_dquot.c | 12 ++- fs/xfs/xfs_iomap.c | 20 ++--- fs/xfs/xfs_rtalloc.c | 6 +- fs/xfs/xfs_vnodeops.c | 19 ++--- 8 files changed, 135 insertions(+), 212 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 41ef02b7185a..5484766938f9 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2039,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | - XFS_BMAPI_WRITE, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, args->total, &map, &nmap, args->flist); if (!error) { diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c2e49fd18bfd..595cc6311937 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4211,9 +4211,8 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if (flags & XFS_BMAPI_WRITE) - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || mval[i].br_state == XFS_EXT_UNWRITTEN); } @@ -4809,60 +4808,57 @@ xfs_bmapi_convert_unwritten( } /* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * * The returned value in "firstblock" from the first call in a transaction * must be remembered and presented to subsequent calls in "firstblock". * An upper bound for the number of blocks to be allocated is supplied to * the first call in "total"; if no allocation group has that many free * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). */ -int /* error */ -xfs_bmapi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - xfs_bmbt_irec_t *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist) /* i/o: list extents to free */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ { - xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_mount_t *mp; /* xfs mount structure */ - int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - int tmp_logflags; /* temp flags holder */ - int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ - char wr; /* this is a write request */ - char rt; /* this is a realtime file */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + struct xfs_btree_cur *cur; /* bmap btree cursor */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + struct xfs_bmbt_irec got; /* current file extent record */ + xfs_extnum_t lastx; /* last useful extent number */ + int logflags; /* flags for transaction logging */ + xfs_extlen_t minleft; /* min blocks left after allocation */ + int n; /* current extent index */ + int nallocs; /* number of extents alloc'd */ + xfs_fileoff_t obno; /* old block number (offset) */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + int tmp_logflags; /* temp flags holder */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + #ifdef DEBUG - xfs_fileoff_t orig_bno; /* original block number value */ - int orig_flags; /* original flags arg value */ - xfs_filblks_t orig_len; /* original value of len arg */ - xfs_bmbt_irec_t *orig_mval; /* original value of mval */ - int orig_nmap; /* original value of *nmap */ + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ orig_bno = bno; orig_len = len; @@ -4870,69 +4866,60 @@ xfs_bmapi( orig_mval = mval; orig_nmap = *nmap; #endif + ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
XFS_ATTR_FORK : XFS_DATA_FORK; - mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) - XFS_STATS_INC(xs_blk_mapw); - else - XFS_STATS_INC(xs_blk_mapr); - /* - * IGSTATE flag is used to combine extents which - * differ only due to the state of the extents. - * This technique is used from xfs_getbmap() - * when the caller does not wish to see the - * separation (which is the default). - * - * This technique is also used when writing a - * buffer which has been partially written, - * (usually by being flushed during a chunkread), - * to ensure one write takes place. This also - * prevents a change in the xfs inode extents at - * this time, intentionally. This change occurs - * on completion of the write operation, in - * xfs_strat_comp(), where the xfs_bmapi() call - * is transactioned, and the extents combined. - */ - if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ - wr = 0; /* no allocations are allowed */ + + XFS_STATS_INC(xs_blk_mapw); + logflags = 0; nallocs = 0; cur = NULL; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - ASSERT(wr && tp); - if ((error = xfs_bmap_local_to_extents(tp, ip, - firstblock, total, &logflags, whichfork))) + error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, + &logflags, whichfork); + if (error) goto error0; } - if (wr && *firstblock == NULLFSBLOCK) { + + if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else minleft = 1; - } else + } else { minleft = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - goto error0; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); n = 0; end = bno + len; obno = bno; @@ -4945,19 +4932,14 @@ xfs_bmapi( bma.userdata = 0; while (bno < end && n < *nmap) { - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof && !wr) - got.br_startoff = end; inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && isnullstartblock(got.br_startblock); + wasdelay = !inhole && isnullstartblock(got.br_startblock); + /* * First, deal with the hole before the allocated space * that we found, if any. */ - if (wr && (inhole || wasdelay)) { + if (inhole || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4975,36 +4957,20 @@ xfs_bmapi( minleft = 0; if (bma.rval == NULLFSBLOCK) break; - } else if (inhole) { - /* - * Reading in a hole. 
- */ - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = - XFS_FILBLKS_MIN(len, got.br_startoff - bno); - mval->br_state = XFS_EXT_NORM; - bno += mval->br_blockcount; - len -= mval->br_blockcount; - mval++; - n++; - continue; } /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); /* Execute unwritten extent conversion if necessary */ - if (wr) { - error = xfs_bmapi_convert_unwritten(&bma, mval, len, - &lastx, &cur, firstblock, flist, flags, - &tmp_logflags); - logflags |= tmp_logflags; - if (error == EAGAIN) - continue; - if (error) - goto error0; - } + error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, + &cur, firstblock, flist, + flags, &tmp_logflags); + logflags |= tmp_logflags; + if (error == EAGAIN) + continue; + if (error) + goto error0; /* update the extent map to return */ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); @@ -5016,24 +4982,22 @@ xfs_bmapi( */ if (bno >= end || n >= *nmap || nallocs >= *nmap) break; - /* - * Else go on to the next record. - */ + + /* Else go on to the next record. */ prev = got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else eof = 1; - } } *nmap = n; + /* * Transform from btree to extents, give it cur. */ - if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(wr && cur); + ASSERT(cur); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); logflags |= tmp_logflags; @@ -5061,10 +5025,9 @@ error0: * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. */ - if (logflags) { - ASSERT(tp && wr); + if (logflags) xfs_trans_log_inode(tp, ip, logflags); - } + if (cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c70c19c7f1fa..be235f56d2a6 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,25 +62,23 @@ typedef struct xfs_bmap_free #define XFS_BMAP_MAX_NMAP 4 /* - * Flags for xfs_bmapi + * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. 
*/ -#define XFS_BMAPI_CONVERT 0x200 +#define XFS_BMAPI_CONVERT 0x040 #define XFS_BMAPI_FLAGS \ - { XFS_BMAPI_WRITE, "WRITE" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -265,39 +263,17 @@ xfs_bmap_read_extents( struct xfs_inode *ip, /* incore inode */ int whichfork); /* data or attr fork */ -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist); /* i/o: list extents to free */ - int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); /* * Unmap (remove) blocks from a file. 
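To make the prototype above concrete, a minimal allocating call site follows the firstblock/flist contract spelled out in the comments: the firstblock value returned by the first call in a transaction is handed back unchanged to any later xfs_bmapi_write() calls in that transaction, and freed extents accumulate in the shared flist. A sketch with hypothetical variables follows; the xfs_bmap_finish() teardown is the usual pattern in this era of the code and is assumed here, not shown in these hunks.

	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	int			committed;
	int			error;

	/* Seed the firstblock/flist pair once per transaction. */
	xfs_bmap_init(&free_list, &firstfsb);
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, 0,
				&firstfsb, resblks, &imap, &nimaps,
				&free_list);
	if (!error)	/* commit the accumulated extent-free list */
		error = xfs_bmap_finish(&tp, &free_list, &committed);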
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 70a5f580e5ed..46c8aa2740da 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int( */ nmap = 1; ASSERT(args->firstblock != NULL); - error = xfs_bmapi(tp, dp, *bno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| - XFS_BMAPI_CONTIG, + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist); if (error) @@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int( for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); - error = xfs_bmapi(tp, dp, b, c, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| - XFS_BMAPI_METADATA, + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist); if (error) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c377961657ee..179673531f20 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -379,14 +379,12 @@ xfs_qm_dqalloc( xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; - if ((error = xfs_bmapi(tp, quotip, - offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, - &firstblock, - XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist))) { + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) goto error0; - } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 681ba34c9233..da5bf05c5bb7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -210,20 +210,18 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip); - bmapi_flag = XFS_BMAPI_WRITE; + bmapi_flag = 0; if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks. - * * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -582,14 +580,12 @@ xfs_iomap_write_allocate( } /* - * Go get the actual blocks. - * * From this point onwards we overwrite the imap * pointer that the caller gave to us. 
*/ - error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, &first_block, 1, + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -703,8 +699,8 @@ xfs_iomap_write_unwritten( */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e5f40c6460b2..f29424964521 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -120,9 +120,9 @@ xfs_growfs_rt_alloc( */ nmap = 1; cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 63874a87b378..f47ecee8d437 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1633,10 +1633,9 @@ xfs_symlink( first_fsb = 0; nmaps = SYMLINK_MAPS; - error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, - &first_block, resblks, mval, &nmaps, - &free_list); + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); if (error) goto error2; @@ -1782,7 +1781,6 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -1810,7 +1808,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | alloc_type; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -1883,14 +1880,10 @@ xfs_alloc_file_space( xfs_trans_ijoin(tp, ip); - /* - * Issue the xfs_bmapi() call to allocate the blocks - */ xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, bmapi_flag, - &firstfsb, 0, imapp, &nimaps, - &free_list); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); if (error) { goto error0; } -- cgit v1.2.3-58-ga151 From 27a3f8f2de758205765f277b3428bbf3d15da973 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:53 +0000 Subject: xfs: introduce xfs_bmap_last_extent Add a common helper for finding the last extent in a file. Largely based on a patch from Dave Chinner. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 226 +++++++++++++++++++++++++----------------------------- 1 file changed, 105 insertions(+), 121 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 595cc6311937..d9de5ae1fd5c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -203,19 +203,6 @@ xfs_bmap_search_extents( xfs_bmbt_irec_t *gotp, /* out: extent entry found */ xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. 
When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof); /* return value */ - /* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". @@ -3924,42 +3911,122 @@ xfs_bmap_last_before( return 0; } +STATIC int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 0 in *aeof if the file (fork) is empty as any new write will be at, + * or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_inode *ip, + xfs_fileoff_t off, + int whichfork, + char *aeof) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *aeof = 0; + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + *aeof = off >= rec.br_startoff + rec.br_blockcount || + (off >= rec.br_startoff && isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + /* * Returns the file-relative block number of the first block past eof in * the file. This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. 
*/ -int /* error */ +int xfs_bmap_last_offset( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) { - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (!nextents) { - *last_block = 0; - return 0; - } - ep = xfs_iext_get_ext(ifp, nextents - 1); - *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + + *last_block = rec.br_startoff + rec.br_blockcount; return 0; } @@ -5687,89 +5754,6 @@ xfs_getbmap( return error; } -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof) /* return value */ -{ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t s; /* expanded extent record */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *aeof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - xfs_bmbt_get_all(lastrec, &s); - /* - * Check we are allocating in the last extent (for delayed allocations) - * or past the last extent for non-delayed allocations. - */ - *aeof = (off >= s.br_startoff && - off < s.br_startoff + s.br_blockcount && - isnullstartblock(s.br_startblock)) || - off >= s.br_startoff + s.br_blockcount; - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. 
- */ -int /* error */ -xfs_bmap_eof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t endoff, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - int *eof) /* result value */ -{ - xfs_fsblock_t blockcount; /* extent block count */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff; /* extent starting file offset */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *eof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - startoff = xfs_bmbt_get_startoff(lastrec); - blockcount = xfs_bmbt_get_blockcount(lastrec); - *eof = endoff >= startoff + blockcount; - return 0; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( -- cgit v1.2.3-58-ga151 From a5bd606ba65f24e5990edfc0e7b52702720ee6fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:54 +0000 Subject: xfs: remove xfs_bmap_add_extent There is no real need for xfs_bmap_add_extent, as the callers know what kind of extents they need to add. Removing it means duplicating the extents-to-btree conversion logic in three places, but overall it's still much simpler code and quite a bit less code. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 416 +++++++++++++++++++++++------------------------------- 1 file changed, 173 insertions(+), 243 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d9de5ae1fd5c..0c9be0ed606e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -50,17 +50,22 @@ #include "xfs_trace.h" -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); -#endif - kmem_zone_t *xfs_bmap_free_item_zone; /* * Prototypes for internal bmap routines. */ +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents( + struct xfs_btree_cur *cur, + struct xfs_inode *ip, + int whichfork); +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#endif + /* * Called from xfs_bmap_add_attrfork to handle extents format files. @@ -84,47 +89,6 @@ xfs_bmap_add_attrfork_local( xfs_bmap_free_t *flist, /* blocks to free at commit */ int *flags); /* inode logging flags */ -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation.
- */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. - */ -STATIC int /* error */ -xfs_bmap_add_extent_unwritten_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. @@ -407,147 +371,7 @@ xfs_bmap_add_attrfork_local( } /* - * Update file extent records and the btree after allocating space. - */ -STATIC int /* error */ -xfs_bmap_add_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_btree_cur_t *cur; /* btree cursor or null */ - xfs_filblks_t da_new; /* new count del alloc blocks used */ - xfs_filblks_t da_old; /* old count del alloc blocks used */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork ptr */ - int logflags; /* returned value */ - xfs_extnum_t nextents; /* number of extents in file now */ - - XFS_STATS_INC(xs_add_exlist); - - cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - da_old = da_new = 0; - error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - ASSERT(!isnullstartblock(new->br_startblock)); - - /* - * Real allocation off the end of the file. - */ - if (*idx == nextents) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); - } else { - xfs_bmbt_irec_t prev; /* old extent at offset idx */ - - /* - * Get the record referred to by idx. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); - /* - * If it's a real allocation record, and the new allocation ends - * after the start of the referred to record, then we're filling - * in a delayed or unwritten allocation with a real one, or - * converting real back to unwritten. 
- */ - if (!isnullstartblock(new->br_startblock) && - new->br_startoff + new->br_blockcount > prev.br_startoff) { - if (prev.br_state != XFS_EXT_UNWRITTEN && - isnullstartblock(prev.br_startblock)) { - da_old = startblockval(prev.br_startblock); - if (cur) - ASSERT(cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(tp, ip, - idx, &cur, new, &da_new, - first, flist, &logflags); - } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) - goto done; - } - } - /* - * Otherwise we're filling in a hole with an allocation. - */ - else { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); - } - } - - if (error) - goto done; - ASSERT(*curp == cur || *curp == NULL); - - /* - * Convert to a btree if necessary. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { - int tmp_logflags; /* partial log flag return val */ - - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, da_old > 0, &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto done; - } - /* - * Adjust for changes in reserved delayed indirect blocks. - * Nothing to do for disk quotas here. - */ - if (da_old || da_new) { - xfs_filblks_t nblks; - - nblks = da_new; - if (cur) - nblks += cur->bc_private.b.allocated; - ASSERT(nblks <= da_old); - if (nblks < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); - } - /* - * Clear out the allocated field, done with it now in any case. - */ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } -done: -#ifdef DEBUG - if (!error) - xfs_bmap_check_leaf_extents(*curp, ip, whichfork); -#endif - *logflagsp = logflags; - return error; -} - -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Convert a delayed allocation to a real allocation. 
*/ STATIC int /* error */ xfs_bmap_add_extent_delay_real( @@ -556,7 +380,6 @@ xfs_bmap_add_extent_delay_real( xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ @@ -572,10 +395,24 @@ xfs_bmap_add_extent_delay_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* value for dnew calculations */ - xfs_filblks_t temp2=0;/* value for dnew calculations */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cur = *curp; + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!cur || (cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + + *logflagsp = 0; + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -583,14 +420,15 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + da_old = startblockval(PREV.br_startblock); + da_new = 0; + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. @@ -688,7 +526,6 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, LEFT.br_state))) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -719,7 +556,6 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount, LEFT.br_state))) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -749,8 +585,6 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, PREV.br_state))) goto done; } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -778,8 +612,6 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -813,13 +645,12 @@ xfs_bmap_add_extent_delay_real( LEFT.br_state))) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); --*idx; - *dnew = temp; break; case BMAP_LEFT_FILLING: @@ -856,14 +687,12 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? 
cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, *idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - - *dnew = temp; break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -896,14 +725,13 @@ xfs_bmap_add_extent_delay_real( goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); ++*idx; - *dnew = temp; break; case BMAP_RIGHT_FILLING: @@ -939,15 +767,14 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); ++*idx; - *dnew = temp; break; case 0: @@ -1029,7 +856,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); ++*idx; - *dnew = temp + temp2; + da_new = temp + temp2; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1044,9 +871,39 @@ xfs_bmap_add_extent_delay_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (cur) + temp += cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + xfs_bmap_check_leaf_extents(cur, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1054,15 +911,17 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Convert an unwritten allocation to a real allocation or vice versa. 
*/ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -1078,15 +937,25 @@ xfs_bmap_add_extent_unwritten_real( int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] + /* * Set up a bunch of variables to make the tests simpler. */ error = 0; - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; @@ -1537,9 +1406,29 @@ xfs_bmap_add_extent_unwritten_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1691,30 +1580,42 @@ xfs_bmap_add_extent_hole_delay( } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Convert a hole to a real allocation. 
*/ STATIC int /* error */ xfs_bmap_add_extent_hole_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_btree_cur_t **curp, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { int error; /* error return value */ int i; /* temp state */ + xfs_btree_cur_t *cur; /* if null, not a btree */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ + *logflagsp = 0; + ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - state = 0; + cur = *curp; + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; @@ -1897,8 +1798,28 @@ xfs_bmap_add_extent_hole_real( } break; } + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, + flist, &cur, 0, &tmp_logflags, whichfork); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: - *logflagsp = rval; + *logflagsp |= rval; return error; } @@ -4792,14 +4713,22 @@ xfs_bmapi_allocate( xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->gotp->br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, bma->gotp, - firstblock, flist, logflags, whichfork); + if (bma->wasdel) { + error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, + cur, bma->gotp, firstblock, flist, logflags); + } else { + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, + cur, bma->gotp, firstblock, flist, logflags, + whichfork); + } + if (error) return error; /* - * Update our extent pointer, given that xfs_bmap_add_extent might - * have merged it into one of the neighbouring ones. + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); @@ -4854,14 +4783,15 @@ xfs_bmapi_convert_unwritten( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, mval, - firstblock, flist, logflags, whichfork); + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, + cur, mval, firstblock, flist, logflags); if (error) return error; /* - * Update our extent pointer, given that xfs_bmap_add_extent might - * have merged it into one of the neighbouring ones. 
+ * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); @@ -5287,9 +5217,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); if (error) goto error0; goto nodelete; @@ -5345,18 +5275,18 @@ xfs_bunmapi( } prev.br_state = XFS_EXT_UNWRITTEN; lastx--; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &prev, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &del, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; -- cgit v1.2.3-58-ga151 From 1b16447ba24ae39c7fe7133fcdcb4f174dec1901 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:55 +0000 Subject: xfs: pass bmalloca structure to xfs_bmap_isaeof All the variables xfs_bmap_isaeof() is passed are contained within the xfs_bmalloca structure. Pass that instead. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0c9be0ed606e..e2eb3ba5b420 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3867,22 +3867,21 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in *aeof if the file (fork) is empty as any new write will be at, - * or past the EOF. + * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. */ STATIC int xfs_bmap_isaeof( - struct xfs_inode *ip, - xfs_fileoff_t off, - int whichfork, - char *aeof) + struct xfs_bmalloca *bma, + int whichfork) { struct xfs_bmbt_irec rec; int is_empty; int error; - *aeof = 0; - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); if (error || is_empty) return error; @@ -3890,8 +3889,9 @@ xfs_bmap_isaeof( * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. 
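Aside: the xfs_bmap_isaeof() change here is the recurring move of this series. A function that took several exploded arguments now takes the allocation context and reads and writes fields on it. A small standalone sketch of the same move, with invented names rather than the XFS API:

struct walloc_example {
        long    offset;         /* in: file offset being allocated */
        int     aeof;           /* out: allocating at or past EOF? */
};

/* Before: int is_aeof(struct inode_example *, long off, int fork, int *aeof); */
static int
is_aeof_example(struct walloc_example *w, long last_end)
{
        w->aeof = (w->offset >= last_end);
        return 0;
}

The out-parameter becomes a context field, so later stages such as the stripe-alignment check nearby can consult it with no extra plumbing.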
*/ - *aeof = off >= rec.br_startoff + rec.br_blockcount || - (off >= rec.br_startoff && isnullstartblock(rec.br_startblock)); + bma->aeof = bma->off >= rec.br_startoff + rec.br_blockcount || + (bma->off >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); return 0; } @@ -4658,7 +4658,7 @@ xfs_bmapi_allocate( */ if (mp->m_dalign && alen >= mp->m_dalign && !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { - error = xfs_bmap_isaeof(bma->ip, aoff, whichfork, &bma->aeof); + error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } -- cgit v1.2.3-58-ga151 From baf41a52b9c62f9a825371806129ed12e2c1e2d8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:56 +0000 Subject: xfs: move extent records into bmalloca structure Rather than putting extent records on the stack and then pointing to them in the bmalloca structure which is in the same stack frame, put the extent records directly in the bmalloca structure. This reduces the number of args that need to be passed around. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 93 +++++++++++++++++++++++++++---------------------------- fs/xfs/xfs_bmap.h | 4 +-- 2 files changed, 47 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e2eb3ba5b420..68edf99bfad6 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2013,18 +2013,18 @@ xfs_bmap_adjacent( * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ - if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, - ap->prevp->br_startblock)) { - ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->rval = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ adjust = ap->off - - (ap->prevp->br_startoff + ap->prevp->br_blockcount); + (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) + ISVALID(ap->rval + adjust, ap->prev.br_startblock)) ap->rval += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2042,17 +2042,17 @@ xfs_bmap_adjacent( * If there's a previous (left) block, select a requested * start block based on it. */ - if (ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - (prevbno = ap->prevp->br_startblock + - ap->prevp->br_blockcount) && - ISVALID(prevbno, ap->prevp->br_startblock)) { + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ adjust = prevdiff = ap->off - - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); + (ap->prev.br_startoff + + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size.
@@ -2063,7 +2063,7 @@ xfs_bmap_adjacent( */ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && ISVALID(prevbno + prevdiff, - ap->prevp->br_startblock)) + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -2084,16 +2084,16 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!isnullstartblock(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->gotp->br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->off; /* * Figure the startblock based on the next block's * start and the gap size. */ - gotbno = ap->gotp->br_startblock; + gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're @@ -2151,7 +2151,7 @@ xfs_bmap_rtalloc( mp = ap->ip->i_mount; align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, ap->conv, &ap->off, &ap->alen); if (error) @@ -2374,7 +2374,7 @@ xfs_bmap_btalloc( mp = ap->ip->i_mount; align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->off, &ap->alen); ASSERT(!error); @@ -4619,17 +4619,17 @@ xfs_bmapi_allocate( * for in this bmap call but that wouldn't be as good. */ if (bma->wasdel) { - alen = (xfs_extlen_t)bma->gotp->br_blockcount; - aoff = bma->gotp->br_startoff; + alen = (xfs_extlen_t)bma->got.br_blockcount; + aoff = bma->got.br_startoff; if (*lastx != NULLEXTNUM && *lastx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), - bma->prevp); + &bma->prev); } } else { alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->alen, MAXEXTLEN); if (!bma->eof) alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, - bma->gotp->br_startoff - bma->off); + bma->got.br_startoff - bma->off); aoff = bma->off; } @@ -4700,10 +4700,10 @@ xfs_bmapi_allocate( (*cur)->bc_private.b.flags = bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - bma->gotp->br_startoff = aoff; - bma->gotp->br_startblock = abno; - bma->gotp->br_blockcount = alen; - bma->gotp->br_state = XFS_EXT_NORM; + bma->got.br_startoff = aoff; + bma->got.br_startblock = abno; + bma->got.br_blockcount = alen; + bma->got.br_state = XFS_EXT_NORM; /* * A wasdelay extent has been initialized, so shouldn't be flagged @@ -4711,14 +4711,14 @@ xfs_bmapi_allocate( */ if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) - bma->gotp->br_state = XFS_EXT_UNWRITTEN; + bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - cur, bma->gotp, firstblock, flist, logflags); + cur, &bma->got, firstblock, flist, logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - cur, bma->gotp, firstblock, flist, logflags, + cur, &bma->got, firstblock, flist, logflags, whichfork); } @@ -4730,13 +4730,12 @@ xfs_bmapi_allocate( * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. 
*/ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); - ASSERT(bma->gotp->br_startoff <= aoff); - ASSERT(bma->gotp->br_startoff + bma->gotp->br_blockcount >= - aoff + alen); - ASSERT(bma->gotp->br_state == XFS_EXT_NORM || - bma->gotp->br_state == XFS_EXT_UNWRITTEN); + ASSERT(bma->got.br_startoff <= aoff); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= aoff + alen); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); return 0; } @@ -4793,7 +4792,7 @@ xfs_bmapi_convert_unwritten( * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); /* * We may have combined previously unwritten space with written space, @@ -4837,14 +4836,12 @@ xfs_bmapi_write( xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ - struct xfs_bmbt_irec got; /* current file extent record */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks left after allocation */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ xfs_fileoff_t obno; /* old block number (offset) */ - struct xfs_bmbt_irec prev; /* previous file extent record */ int tmp_logflags; /* temp flags holder */ int whichfork; /* data or attr fork */ char inhole; /* current location is hole in file */ @@ -4916,21 +4913,20 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &bma.got, + &bma.prev); n = 0; end = bno + len; obno = bno; bma.tp = tp; bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; bma.total = total; bma.userdata = 0; while (bno < end && n < *nmap) { - inhole = eof || got.br_startoff > bno; - wasdelay = !inhole && isnullstartblock(got.br_startblock); + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); /* * First, deal with the hole before the allocated space @@ -4957,7 +4953,8 @@ xfs_bmapi_write( } /* Deal with the allocated space we found. */ - xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, @@ -4981,9 +4978,9 @@ xfs_bmapi_write( break; /* Else go on to the next record. 
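Aside: the loop below this point used to copy between stack locals that the context merely pointed at; with the records embedded, "bma.prev = bma.got" is a plain struct assignment and nothing in the context can dangle once the caller's frame unwinds. A standalone sketch of embedding by value, with invented names:

struct rec_example {
        long    start;
        long    count;
};

struct walk_ctx_example {
        /* embedded by value: no pointers into a caller's stack frame */
        struct rec_example      prev;   /* extent before the new one */
        struct rec_example      got;    /* extent after, or delayed */
};

static void
advance_example(struct walk_ctx_example *c, struct rec_example next)
{
        c->prev = c->got;       /* struct assignment copies the record */
        c->got = next;
}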
*/ - prev = got; + bma.prev = bma.got; if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &bma.got); else eof = 1; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index be235f56d2a6..6e8f8aee7cdb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -114,8 +114,8 @@ typedef struct xfs_bmalloca { xfs_fileoff_t off; /* offset in file filling in */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec *prevp; /* extent before the new one */ - struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ xfs_extlen_t alen; /* i/o length asked/allocated */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ -- cgit v1.2.3-58-ga151 From 0937e0fd8be6f9c26844127d39d677bb752e8741 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:57 +0000 Subject: xfs: move firstblock and bmap freelist cursor into bmalloca structure Rather than passing the firstblock and freelist structure around, embed it into the bmalloca structure and remove it from the function parameters. This also enables the minleft parameter to be set only once in xfs_bmapi_write(), and the freelist cursor directly queried in xfs_bmapi_allocate to clear it when the lowspace algorithm is activated. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 92 ++++++++++++++++++++++++------------------------- fs/xfs/xfs_bmap.h | 4 +-- fs/xfs/xfs_filestream.c | 2 +- 3 files changed, 48 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 68edf99bfad6..608a0013791b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2006,9 +2006,9 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->firstblock == NULLFSBLOCK; + nullfb = *ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. @@ -2380,8 +2380,8 @@ xfs_bmap_btalloc( ASSERT(!error); ASSERT(ap->alen); } - nullfb = ap->firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); @@ -2391,7 +2391,7 @@ xfs_bmap_btalloc( ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = ap->firstblock; + ap->rval = *ap->firstblock; xfs_bmap_adjacent(ap); @@ -2402,7 +2402,7 @@ xfs_bmap_btalloc( if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) ; else - ap->rval = ap->firstblock; + ap->rval = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ @@ -2413,13 +2413,13 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. 
*/ args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); - args.firstblock = ap->firstblock; + args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->low) { + } else if (ap->flist->xbf_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -2452,7 +2452,7 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->low && ap->aeof) { + if (!ap->flist->xbf_low && ap->aeof) { if (!ap->off) { args.alignment = mp->m_dalign; atype = args.type; @@ -2540,12 +2540,25 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->low = 1; + ap->flist->xbf_low = 1; } if (args.fsbno != NULLFSBLOCK) { - ap->firstblock = ap->rval = args.fsbno; + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->rval = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->low && fb_agno < args.agno)); + (ap->flist->xbf_low && fb_agno < args.agno)); ap->alen = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); @@ -4596,8 +4609,6 @@ xfs_bmapi_allocate( struct xfs_bmalloca *bma, xfs_extnum_t *lastx, struct xfs_btree_cur **cur, - xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int flags, int *nallocs, int *logflags) @@ -4647,9 +4658,7 @@ xfs_bmapi_allocate( */ bma->alen = alen; bma->off = aoff; - bma->firstblock = *firstblock; bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - bma->low = flist->xbf_low; bma->aeof = 0; /* @@ -4671,24 +4680,18 @@ xfs_bmapi_allocate( * Copy out result fields. 
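Aside: keeping a pointer to the caller's firstblock in the context lets the allocator both record the first allocation of the transaction and, as in the ASSERT above, check that every later allocation comes from the same or a higher allocation group (the low-space algorithm relaxes this to strictly ascending). A standalone sketch of that invariant with toy AG arithmetic; the names are invented and this is not XFS code:

#include <assert.h>

#define NULLBLK_EXAMPLE         (-1L)
#define AG_OF_EXAMPLE(b)        ((b) / 1000)    /* toy allocation-group number */

struct actx_example {
        long    *firstblock;    /* points at the caller's variable */
        int     low;            /* low-space mode active? */
};

static void
record_alloc_example(struct actx_example *a, long blkno)
{
        assert(*a->firstblock == NULLBLK_EXAMPLE ||
               AG_OF_EXAMPLE(*a->firstblock) == AG_OF_EXAMPLE(blkno) ||
               (a->low &&
                AG_OF_EXAMPLE(*a->firstblock) < AG_OF_EXAMPLE(blkno)));

        if (*a->firstblock == NULLBLK_EXAMPLE)
                *a->firstblock = blkno; /* first allocation: remember it */
}

Because the context stores the pointer rather than a copy, the update is visible to the caller without a copy-back step on every exit path.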
*/ abno = bma->rval; - flist->xbf_low = bma->low; alen = bma->alen; aoff = bma->off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma->firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma->firstblock))); - *firstblock = bma->firstblock; + if (bma->flist->xbf_low) + bma->minleft = 0; if (*cur) - (*cur)->bc_private.b.firstblock = *firstblock; + (*cur)->bc_private.b.firstblock = *bma->firstblock; if (abno == NULLFSBLOCK) return 0; if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *firstblock; - (*cur)->bc_private.b.flist = flist; + (*cur)->bc_private.b.firstblock = *bma->firstblock; + (*cur)->bc_private.b.flist = bma->flist; } /* * Bump the number of extents we've allocated @@ -4715,11 +4718,12 @@ xfs_bmapi_allocate( if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - cur, &bma->got, firstblock, flist, logflags); + cur, &bma->got, bma->firstblock, bma->flist, + logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - cur, &bma->got, firstblock, flist, logflags, - whichfork); + cur, &bma->got, bma->firstblock, bma->flist, + logflags, whichfork); } if (error) @@ -4746,8 +4750,6 @@ xfs_bmapi_convert_unwritten( xfs_filblks_t len, xfs_extnum_t *lastx, struct xfs_btree_cur **cur, - xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int flags, int *logflags) { @@ -4776,14 +4778,14 @@ xfs_bmapi_convert_unwritten( if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { *cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *firstblock; - (*cur)->bc_private.b.flist = flist; + (*cur)->bc_private.b.firstblock = *bma->firstblock; + (*cur)->bc_private.b.flist = bma->flist; } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, - cur, mval, firstblock, flist, logflags); + cur, mval, bma->firstblock, bma->flist, logflags); if (error) return error; @@ -4838,7 +4840,6 @@ xfs_bmapi_write( int error; /* error return */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ xfs_fileoff_t obno; /* old block number (offset) */ @@ -4900,11 +4901,11 @@ xfs_bmapi_write( if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else - minleft = 1; + bma.minleft = 1; } else { - minleft = 0; + bma.minleft = 0; } if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -4923,6 +4924,8 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; @@ -4938,16 +4941,12 @@ xfs_bmapi_write( bma.wasdel = wasdelay; bma.alen = len; bma.off = bno; - bma.minleft = minleft; - error = xfs_bmapi_allocate(&bma, &lastx, &cur, - firstblock, flist, flags, &nallocs, - &tmp_logflags); + error = xfs_bmapi_allocate(&bma, &lastx, &cur, flags, + &nallocs, &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; - if (flist && flist->xbf_low) - minleft = 0; if (bma.rval == NULLFSBLOCK) break; } @@ -4958,8 +4957,7 @@ xfs_bmapi_write( /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, - &cur, firstblock, flist, - flags, &tmp_logflags); + &cur, flags, &tmp_logflags); logflags |= tmp_logflags; if (error == EAGAIN) continue; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 6e8f8aee7cdb..6e7c7a50d248 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -109,7 +109,8 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) * Argument structure for xfs_bmap_alloc. */ typedef struct xfs_bmalloca { - xfs_fsblock_t firstblock; /* i/o first block allocated */ + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ xfs_fsblock_t rval; /* starting block of new extent */ xfs_fileoff_t off; /* offset in file filling in */ struct xfs_trans *tp; /* transaction pointer */ @@ -123,7 +124,6 @@ typedef struct xfs_bmalloca { char eof; /* set if allocating past last extent */ char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ - char low; /* low on space, using seq'l ags */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3ff3d9e23ded..137f957b8c7e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -761,7 +761,7 @@ xfs_filestream_new_ag( */ ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->low ? XFS_PICK_LOWSPACE : 0); + (ap->flist->xbf_low ? 
XFS_PICK_LOWSPACE : 0); err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); if (err || *agp == NULLAGNUMBER) -- cgit v1.2.3-58-ga151 From 3a75667e902dbdb87718b1ee2b3b745b344a8163 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:58 +0000 Subject: xfs: rename allocation range fields in struct xfs_bmalloca Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 138 ++++++++++++++++++++++++------------------------ fs/xfs/xfs_bmap.h | 8 +-- fs/xfs/xfs_filestream.c | 2 +- 3 files changed, 75 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 608a0013791b..b47555cfbd8f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2017,15 +2017,15 @@ xfs_bmap_adjacent( !isnullstartblock(ap->prev.br_startblock) && ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, ap->prev.br_startblock)) { - ap->rval = ap->prev.br_startblock + ap->prev.br_blockcount; + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ - adjust = ap->off - + adjust = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prev.br_startblock)) - ap->rval += adjust; + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2050,7 +2050,7 @@ xfs_bmap_adjacent( /* * Calculate gap to end of previous block. */ - adjust = prevdiff = ap->off - + adjust = prevdiff = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); /* @@ -2061,7 +2061,7 @@ xfs_bmap_adjacent( * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ - if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(prevbno + prevdiff, ap->prev.br_startblock)) prevbno += adjust; @@ -2088,7 +2088,7 @@ xfs_bmap_adjacent( /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->got.br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. @@ -2101,12 +2101,12 @@ xfs_bmap_adjacent( * number, then just use the start of the next block * offset by our length. */ - if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(gotbno - gotdiff, gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->alen, gotbno)) { - gotbno -= ap->alen; - gotdiff += adjust - ap->alen; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; } else gotdiff += adjust; /* @@ -2124,14 +2124,14 @@ xfs_bmap_adjacent( gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good - * one, else ap->rval is already set (to 0 or the inode block). + * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) - ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + ap->blkno = prevdiff <= gotdiff ? 
prevbno : gotbno; else if (prevbno != NULLFSBLOCK) - ap->rval = prevbno; + ap->blkno = prevbno; else if (gotbno != NULLFSBLOCK) - ap->rval = gotbno; + ap->blkno = gotbno; } #undef ISVALID } @@ -2153,22 +2153,22 @@ xfs_bmap_rtalloc( prod = align / mp->m_sb.sb_rextsize; error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, - ap->conv, &ap->off, &ap->alen); + ap->conv, &ap->offset, &ap->length); if (error) return error; - ASSERT(ap->alen); - ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, align) || ap->alen % align) + if (do_mod(ap->offset, align) || ap->length % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. */ - ralen = ap->alen / mp->m_sb.sb_rextsize; + ralen = ap->length / mp->m_sb.sb_rextsize; /* * If the old value was close enough to MAXEXTLEN that * we rounded up to it, cut it back so it's valid again. @@ -2189,15 +2189,15 @@ xfs_bmap_rtalloc( * If it's an allocation to an empty file at offset 0, * pick an extent that will space things out in the rt area. */ - if (ap->eof && ap->off == 0) { + if (ap->eof && ap->offset == 0) { xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->rval = rtx * mp->m_sb.sb_rextsize; + ap->blkno = rtx * mp->m_sb.sb_rextsize; } else { - ap->rval = 0; + ap->blkno = 0; } xfs_bmap_adjacent(ap); @@ -2205,23 +2205,23 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->rval, mp->m_sb.sb_rextsize); - rtb = ap->rval; - ap->alen = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, &ralen, atype, ap->wasdel, prod, &rtb))) return error; if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, - ap->alen, &ralen, atype, + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, ap->wasdel, 1, &rtb))) return error; - ap->rval = rtb; - if (ap->rval != NULLFSBLOCK) { - ap->rval *= mp->m_sb.sb_rextsize; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; ralen *= mp->m_sb.sb_rextsize; - ap->alen = ralen; + ap->length = ralen; ap->ip->i_d.di_nblocks += ralen; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2234,7 +2234,7 @@ xfs_bmap_rtalloc( ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); } else { - ap->alen = 0; + ap->length = 0; } return 0; } @@ -2349,7 +2349,7 @@ xfs_bmap_btalloc_nullfb( * AG as the stream may have moved. */ if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -2376,9 +2376,9 @@ xfs_bmap_btalloc( if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, - &ap->off, &ap->alen); + &ap->offset, &ap->length); ASSERT(!error); - ASSERT(ap->alen); + ASSERT(ap->length); } nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? 
NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); @@ -2386,33 +2386,33 @@ xfs_bmap_btalloc( if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); } else { - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = *ap->firstblock; + ap->blkno = *ap->firstblock; xfs_bmap_adjacent(ap); /* - * If allowed, use ap->rval; otherwise must use firstblock since + * If allowed, use ap->blkno; otherwise must use firstblock since * it's in the right allocation group. */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) ; else - ap->rval = *ap->firstblock; + ap->blkno = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; args.tp = ap->tp; args.mp = mp; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { @@ -2433,14 +2433,14 @@ xfs_bmap_btalloc( /* apply extent size hints if obtained earlier */ if (unlikely(align)) { args.prod = align; - if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } /* @@ -2453,7 +2453,7 @@ xfs_bmap_btalloc( * at the end of file. */ if (!ap->flist->xbf_low && ap->aeof) { - if (!ap->off) { + if (!ap->offset) { args.alignment = mp->m_dalign; atype = args.type; isaligned = 1; @@ -2506,7 +2506,7 @@ xfs_bmap_btalloc( * turned on. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = mp->m_dalign; args.minlen = nextminlen; args.minalignslop = 0; @@ -2520,7 +2520,7 @@ xfs_bmap_btalloc( * try again. 
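Aside: the retry ladder in this region weakens one constraint per attempt, first giving up stripe alignment, then shrinking the minimum length, before the caller finally falls back to low-space mode. A standalone sketch of that cascade; the names are invented, and the toy allocator simply refuses while the constraints are still "too strict":

struct aargs_example {
        int     alignment;
        int     minlen;
};

/* Toy stand-in for xfs_alloc_vextent(); *out == -1 means "no space". */
static int
vextent_example(struct aargs_example *a, long *out)
{
        *out = (a->alignment > 1 || a->minlen > 2) ? -1 : 42;
        return 0;
}

static int
btalloc_example(int want_minlen, long *out)
{
        struct aargs_example args = { 8, want_minlen };
        int error;

        if ((error = vextent_example(&args, out)))
                return error;
        if (*out == -1) {
                args.alignment = 1;     /* retry unaligned */
                if ((error = vextent_example(&args, out)))
                        return error;
        }
        if (*out == -1 && args.minlen > 1) {
                args.minlen = 1;        /* retry with the bare minimum */
                if ((error = vextent_example(&args, out)))
                        return error;
        }
        return 0;
}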
*/ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) return error; @@ -2529,7 +2529,7 @@ xfs_bmap_btalloc( args.minlen > ap->minlen) { args.minlen = ap->minlen; args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2554,12 +2554,12 @@ xfs_bmap_btalloc( XFS_FSB_TO_AGNO(mp, *ap->firstblock) < XFS_FSB_TO_AGNO(mp, args.fsbno))); - ap->rval = args.fsbno; + ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || (ap->flist->xbf_low && fb_agno < args.agno)); - ap->alen = args.len; + ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2573,8 +2573,8 @@ xfs_bmap_btalloc( XFS_TRANS_DQ_BCOUNT, (long) args.len); } else { - ap->rval = NULLFSBLOCK; - ap->alen = 0; + ap->blkno = NULLFSBLOCK; + ap->length = 0; } return 0; } @@ -3902,8 +3902,8 @@ xfs_bmap_isaeof( * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. */ - bma->aeof = bma->off >= rec.br_startoff + rec.br_blockcount || - (bma->off >= rec.br_startoff && + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && isnullstartblock(rec.br_startblock)); return 0; } @@ -4637,11 +4637,11 @@ xfs_bmapi_allocate( &bma->prev); } } else { - alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->alen, MAXEXTLEN); + alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, - bma->got.br_startoff - bma->off); - aoff = bma->off; + bma->got.br_startoff - bma->offset); + aoff = bma->offset; } /* @@ -4656,8 +4656,8 @@ xfs_bmapi_allocate( /* * Fill in changeable bma fields. */ - bma->alen = alen; - bma->off = aoff; + bma->length = alen; + bma->offset = aoff; bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; bma->aeof = 0; @@ -4679,9 +4679,9 @@ xfs_bmapi_allocate( /* * Copy out result fields. 
*/ - abno = bma->rval; - alen = bma->alen; - aoff = bma->off; + abno = bma->blkno; + alen = bma->length; + aoff = bma->offset; if (bma->flist->xbf_low) bma->minleft = 0; if (*cur) @@ -4939,15 +4939,15 @@ xfs_bmapi_write( bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; - bma.alen = len; - bma.off = bno; + bma.length = len; + bma.offset = bno; error = xfs_bmapi_allocate(&bma, &lastx, &cur, flags, &nallocs, &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; - if (bma.rval == NULLFSBLOCK) + if (bma.blkno == NULLFSBLOCK) break; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 6e7c7a50d248..5f398b1ac70c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -111,13 +111,15 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) typedef struct xfs_bmalloca { xfs_fsblock_t *firstblock; /* i/o first block allocated */ struct xfs_bmap_free *flist; /* bmap freelist */ - xfs_fsblock_t rval; /* starting block of new extent */ - xfs_fileoff_t off; /* offset in file filling in */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ struct xfs_bmbt_irec got; /* extent after, or delayed */ - xfs_extlen_t alen; /* i/o length asked/allocated */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 137f957b8c7e..5170306a1009 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -682,7 +682,7 @@ xfs_filestream_new_ag( ip = ap->ip; mp = ip->i_mount; cache = mp->m_filestream; - minlen = ap->alen; + minlen = ap->length; *agp = NULLAGNUMBER; /* -- cgit v1.2.3-58-ga151 From 963c30cf45e8c832ae11438ff9d99c954b9d0114 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:59 +0000 Subject: xfs: do not keep local copies of allocation ranges in xfs_bmapi_allocate Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index b47555cfbd8f..6a7832d3540e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4617,9 +4617,6 @@ xfs_bmapi_allocate( int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); - xfs_fsblock_t abno; - xfs_extlen_t alen; - xfs_fileoff_t aoff; int error; int rt; @@ -4630,18 +4627,17 @@ xfs_bmapi_allocate( * for in this bmap call but that wouldn't be as good. 
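Aside: this commit's deletions remove a copy-in/copy-out dance. Instead of mirroring bma->length and bma->offset into alen/aoff and writing them back at the end, the helper now clamps the context fields in place. A standalone sketch of the in-place form, under invented names:

#define MAXLEN_EXAMPLE  128     /* toy stand-in for MAXEXTLEN */

struct bctx_example {
        long    offset;         /* offset in file filling in */
        long    length;         /* i/o length asked/allocated */
        int     eof;            /* allocating past the last extent? */
};

static void
clamp_request_example(struct bctx_example *b, long next_start)
{
        if (b->length > MAXLEN_EXAMPLE)
                b->length = MAXLEN_EXAMPLE;
        if (!b->eof && next_start - b->offset < b->length)
                b->length = next_start - b->offset;     /* stop at next extent */
}

With one authoritative copy there is no window in which the local and the context disagree, which is what made the old "copy out result fields" blocks necessary.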
*/ if (bma->wasdel) { - alen = (xfs_extlen_t)bma->got.br_blockcount; - aoff = bma->got.br_startoff; + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; if (*lastx != NULLEXTNUM && *lastx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), &bma->prev); } } else { - alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) - alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, + bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); - aoff = bma->offset; } /* @@ -4649,23 +4645,17 @@ xfs_bmapi_allocate( * user data. */ if (!(flags & XFS_BMAPI_METADATA)) { - bma->userdata = (aoff == 0) ? + bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - /* - * Fill in changeable bma fields. - */ - bma->length = alen; - bma->offset = aoff; - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - bma->aeof = 0; + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ - if (mp->m_dalign && alen >= mp->m_dalign && + if (mp->m_dalign && bma->length >= mp->m_dalign && !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) @@ -4676,17 +4666,11 @@ xfs_bmapi_allocate( if (error) return error; - /* - * Copy out result fields. - */ - abno = bma->blkno; - alen = bma->length; - aoff = bma->offset; if (bma->flist->xbf_low) bma->minleft = 0; if (*cur) (*cur)->bc_private.b.firstblock = *bma->firstblock; - if (abno == NULLFSBLOCK) + if (bma->blkno == NULLFSBLOCK) return 0; if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); @@ -4703,9 +4687,9 @@ xfs_bmapi_allocate( (*cur)->bc_private.b.flags = bma->wasdel ? 
XFS_BTCUR_BPRV_WASDEL : 0; - bma->got.br_startoff = aoff; - bma->got.br_startblock = abno; - bma->got.br_blockcount = alen; + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; /* @@ -4736,8 +4720,9 @@ xfs_bmapi_allocate( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); - ASSERT(bma->got.br_startoff <= aoff); - ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= aoff + alen); + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); ASSERT(bma->got.br_state == XFS_EXT_NORM || bma->got.br_state == XFS_EXT_UNWRITTEN); return 0; -- cgit v1.2.3-58-ga151 From 29c8d17a8938be88e36b93522753f3519aefd05d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:41:00 +0000 Subject: xfs: move btree cursor into bmalloca Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 56 ++++++++++++++++++++++++++----------------------------- fs/xfs/xfs_bmap.h | 2 ++ 2 files changed, 28 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6a7832d3540e..a54326bf9aa0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4608,7 +4608,6 @@ STATIC int xfs_bmapi_allocate( struct xfs_bmalloca *bma, xfs_extnum_t *lastx, - struct xfs_btree_cur **cur, int flags, int *nallocs, int *logflags) @@ -4668,14 +4667,14 @@ xfs_bmapi_allocate( if (bma->flist->xbf_low) bma->minleft = 0; - if (*cur) - (*cur)->bc_private.b.firstblock = *bma->firstblock; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; if (bma->blkno == NULLFSBLOCK) return 0; - if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { - (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *bma->firstblock; - (*cur)->bc_private.b.flist = bma->flist; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; } /* * Bump the number of extents we've allocated @@ -4683,8 +4682,8 @@ xfs_bmapi_allocate( */ (*nallocs)++; - if (*cur) - (*cur)->bc_private.b.flags = + if (bma->cur) + bma->cur->bc_private.b.flags = bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; bma->got.br_startoff = bma->offset; @@ -4702,12 +4701,12 @@ xfs_bmapi_allocate( if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - cur, &bma->got, bma->firstblock, bma->flist, - logflags); + &bma->cur, &bma->got, bma->firstblock, + bma->flist, logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - cur, &bma->got, bma->firstblock, bma->flist, - logflags, whichfork); + &bma->cur, &bma->got, bma->firstblock, + bma->flist, logflags, whichfork); } if (error) @@ -4734,7 +4733,6 @@ xfs_bmapi_convert_unwritten( struct xfs_bmbt_irec *mval, xfs_filblks_t len, xfs_extnum_t *lastx, - struct xfs_btree_cur **cur, int flags, int *logflags) { @@ -4760,17 +4758,17 @@ xfs_bmapi_convert_unwritten( * Modify (by adding) the state flag, if writing. 
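Aside: both call sites now follow the same shape, creating the btree cursor the first time it is needed, parking it in the context, and letting a single teardown path at the end of the operation release it. A standalone sketch of context-owned lazy setup, with invented names:

#include <stdlib.h>

struct cursor_example {
        int     dummy;
};

struct wctx_example {
        struct cursor_example   *cur;   /* created on first use */
};

static struct cursor_example *
get_cursor_example(struct wctx_example *w)
{
        if (!w->cur)
                w->cur = calloc(1, sizeof(*w->cur));
        return w->cur;          /* NULL only on allocation failure */
}

static void
teardown_example(struct wctx_example *w)
{
        free(w->cur);           /* one place releases it, success or error */
        w->cur = NULL;
}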
*/ ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { - *cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *bma->firstblock; - (*cur)->bc_private.b.flist = bma->flist; + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, - cur, mval, bma->firstblock, bma->flist, logflags); + &bma->cur, mval, bma->firstblock, bma->flist, logflags); if (error) return error; @@ -4819,7 +4817,6 @@ xfs_bmapi_write( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ - struct xfs_btree_cur *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ @@ -4875,7 +4872,6 @@ xfs_bmapi_write( logflags = 0; nallocs = 0; - cur = NULL; if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, @@ -4927,7 +4923,7 @@ xfs_bmapi_write( bma.length = len; bma.offset = bno; - error = xfs_bmapi_allocate(&bma, &lastx, &cur, flags, + error = xfs_bmapi_allocate(&bma, &lastx, flags, &nallocs, &tmp_logflags); logflags |= tmp_logflags; if (error) @@ -4942,7 +4938,7 @@ xfs_bmapi_write( /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, - &cur, flags, &tmp_logflags); + flags, &tmp_logflags); logflags |= tmp_logflags; if (error == EAGAIN) continue; @@ -4974,8 +4970,8 @@ xfs_bmapi_write( */ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(cur); - error = xfs_bmap_btree_to_extents(tp, ip, cur, + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -5005,19 +5001,19 @@ error0: if (logflags) xfs_trans_log_inode(tp, ip, logflags); - if (cur) { + if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock) || + bma.cur->bc_private.b.firstblock) || (flist->xbf_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock))); - *firstblock = cur->bc_private.b.firstblock; + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; } - xfs_btree_del_cursor(cur, + xfs_btree_del_cursor(bma.cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } if (!error) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 5f398b1ac70c..858d9d509989 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -120,6 +120,8 @@ typedef struct xfs_bmalloca { xfs_extlen_t length; /* i/o length asked/allocated */ xfs_fsblock_t blkno; /* starting block of new extent */ + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ -- cgit v1.2.3-58-ga151 From e0c3da5d89dc1aeef2275a8b751231e147603f0f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:41:01 +0000 Subject: xfs: move lastx and nallocs into bmalloca Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 47 +++++++++++++++++++++-------------------------- fs/xfs/xfs_bmap.h | 2 ++ 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a54326bf9aa0..3ef145307274 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4607,9 +4607,7 @@ xfs_bmapi_delay( STATIC int xfs_bmapi_allocate( struct xfs_bmalloca *bma, - xfs_extnum_t *lastx, int flags, - int *nallocs, int *logflags) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4628,8 +4626,8 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (*lastx != NULLEXTNUM && *lastx) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } } else { @@ -4680,7 +4678,7 @@ xfs_bmapi_allocate( * Bump the number of extents we've allocated * in this call. */ - (*nallocs)++; + bma->nallocs++; if (bma->cur) bma->cur->bc_private.b.flags = @@ -4700,13 +4698,14 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) { - error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - &bma->cur, &bma->got, bma->firstblock, - bma->flist, logflags); + error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, + &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->flist, logflags); } else { - error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - &bma->cur, &bma->got, bma->firstblock, - bma->flist, logflags, whichfork); + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, + &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->flist, logflags, + whichfork); } if (error) @@ -4717,7 +4716,7 @@ xfs_bmapi_allocate( * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); ASSERT(bma->got.br_startoff <= bma->offset); ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= @@ -4732,7 +4731,6 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - xfs_extnum_t *lastx, int flags, int *logflags) { @@ -4767,7 +4765,7 @@ xfs_bmapi_convert_unwritten( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, logflags); if (error) return error; @@ -4777,7 +4775,7 @@ xfs_bmapi_convert_unwritten( * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); /* * We may have combined previously unwritten space with written space, @@ -4820,10 +4818,8 @@ xfs_bmapi_write( xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ - xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ xfs_fileoff_t obno; /* old block number (offset) */ int tmp_logflags; /* temp flags holder */ int whichfork; /* data or attr fork */ @@ -4871,7 +4867,6 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); logflags = 0; - nallocs = 0; if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, @@ -4895,7 +4890,7 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &bma.got, + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, &bma.prev); n = 0; end = bno + len; @@ -4923,8 +4918,7 @@ xfs_bmapi_write( bma.length = len; bma.offset = bno; - error = xfs_bmapi_allocate(&bma, &lastx, flags, - &nallocs, &tmp_logflags); + error = xfs_bmapi_allocate(&bma, flags, &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; @@ -4937,7 +4931,7 @@ xfs_bmapi_write( end, n, flags); /* Execute unwritten extent conversion if necessary */ - error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags, &tmp_logflags); logflags |= tmp_logflags; if (error == EAGAIN) @@ -4953,14 +4947,15 @@ xfs_bmapi_write( * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise * the transaction may get too big. */ - if (bno >= end || n >= *nmap || nallocs >= *nmap) + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) break; /* Else go on to the next record. 
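Aside: once the extent index lives in the context, the "++bma.idx" below advances the walk for every later stage at once, and no helper needs an extra xfs_extnum_t out-parameter. A tiny standalone sketch, names invented:

struct iter_example {
        int     idx;            /* current extent index */
        int     nallocs;        /* extents allocated in this call */
        int     nextents;       /* records in the fork */
};

/* Step to the next record; returns 0 once the walk falls off the end. */
static int
next_record_example(struct iter_example *it)
{
        return ++it->idx < it->nextents;
}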
*/ bma.prev = bma.got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &bma.got); - else + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else eof = 1; } *nmap = n; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 858d9d509989..c2d9e1ef4ba3 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -121,6 +121,8 @@ typedef struct xfs_bmalloca { xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ -- cgit v1.2.3-58-ga151 From c315c90b7d530d1ec3c226052e153b0cffa512c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:02 +0000 Subject: xfs: move logflags into bmalloca Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 48 ++++++++++++++++++++++-------------------------- fs/xfs/xfs_bmap.h | 1 + 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3ef145307274..ed094749851b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4607,13 +4607,13 @@ xfs_bmapi_delay( STATIC int xfs_bmapi_allocate( struct xfs_bmalloca *bma, - int flags, - int *logflags) + int flags) { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; int error; int rt; @@ -4700,14 +4700,15 @@ xfs_bmapi_allocate( if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, logflags); + bma->firstblock, bma->flist, &tmp_logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, logflags, + bma->firstblock, bma->flist, &tmp_logflags, whichfork); } + bma->logflags |= tmp_logflags; if (error) return error; @@ -4731,16 +4732,14 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags, - int *logflags) + int flags) { int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; int error; - *logflags = 0; - /* check if we need to do unwritten->real conversion */ if (mval->br_state == XFS_EXT_UNWRITTEN && (flags & XFS_BMAPI_PREALLOC)) @@ -4766,7 +4765,9 @@ xfs_bmapi_convert_unwritten( ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->flist, logflags); + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; if (error) return error; @@ -4818,10 +4819,8 @@ xfs_bmapi_write( xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ - int logflags; /* flags for transaction logging */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ - int tmp_logflags; /* temp flags holder */ int whichfork; /* data or attr fork */ char inhole; /* current location is hole in file */ char wasdelay; /* old extent was delayed */ @@ -4866,11 +4865,9 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); - logflags = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, - &logflags, whichfork); + &bma.logflags, whichfork); if (error) goto error0; } @@ -4918,8 +4915,7 @@ xfs_bmapi_write( bma.length = len; bma.offset = bno; - error = xfs_bmapi_allocate(&bma, flags, &tmp_logflags); - logflags |= tmp_logflags; + error = xfs_bmapi_allocate(&bma, flags); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) @@ -4931,9 +4927,7 @@ xfs_bmapi_write( end, n, flags); /* Execute unwritten extent conversion if necessary */ - error = xfs_bmapi_convert_unwritten(&bma, mval, len, - flags, &tmp_logflags); - logflags |= tmp_logflags; + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); if (error == EAGAIN) continue; if (error) @@ -4965,10 +4959,12 @@ xfs_bmapi_write( */ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + int tmp_logflags = 0; + ASSERT(bma.cur); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); - logflags |= tmp_logflags; + bma.logflags |= tmp_logflags; if (error) goto error0; } @@ -4982,19 +4978,19 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & xfs_ilog_fext(whichfork)) && + if ((bma.logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~xfs_ilog_fext(whichfork); - else if ((logflags & xfs_ilog_fbroot(whichfork)) && + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~xfs_ilog_fbroot(whichfork); + bma.logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. 
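Aside: the flag handling above is deliberately failure-tolerant. Each stage reports into a private tmp_logflags, the caller ORs that into the context even when the stage errored, and the accumulated mask is logged at the end regardless. A standalone sketch, with invented names:

#define LOG_EXT_EXAMPLE         0x1
#define LOG_BROOT_EXAMPLE       0x2

struct lctx_example {
        int     logflags;
};

static int
stage_example(int *tmp_logflags)
{
        *tmp_logflags = LOG_EXT_EXAMPLE;        /* dirtied before failing */
        return -1;                              /* simulated failure */
}

static int
run_example(struct lctx_example *c)
{
        int tmp_logflags = 0;
        int error = stage_example(&tmp_logflags);

        c->logflags |= tmp_logflags;    /* keep flags even on error */
        return error;
}

As the comment above says, logging whatever the flags say, even on error, is what catches the case where data changed but the failure path would otherwise skip the log.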
*/ - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); if (bma.cur) { if (!error) { diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c2d9e1ef4ba3..243e212b31f6 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -123,6 +123,7 @@ typedef struct xfs_bmalloca { struct xfs_btree_cur *cur; /* btree cursor */ xfs_extnum_t idx; /* current extent index */ int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ -- cgit v1.2.3-58-ga151 From 572a4cf04ac7f46e9206aabfef03dae602812341 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:04 +0000 Subject: xfs: pass bmalloca to xfs_bmap_add_extent_delay_real All the parameters passed to xfs_bmap_add_extent_delay_real() are in the xfs_bmalloca structure now. Just pass the bmalloca parameter to the function instead of 8 separate parameters. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 332 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 171 insertions(+), 161 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index ed094749851b..c8b9c4ec9f6f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -375,16 +375,9 @@ xfs_bmap_add_attrfork_local( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + struct xfs_bmalloca *bma) { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ @@ -401,18 +394,16 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - cur = *curp; + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); - ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || (cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(xs_add_exlist); - *logflagsp = 0; - #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -420,7 +411,7 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); @@ -442,9 +433,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. 
*/ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -462,9 +453,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -495,35 +486,39 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents--; - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + RIGHT.br_blockcount, LEFT.br_state); + if (error) goto done; } break; @@ -533,27 +528,29 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. 
*/ - --*idx; + bma->idx--; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state))) + PREV.br_blockcount, LEFT.br_state); + if (error) goto done; } break; @@ -563,26 +560,28 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state))) + RIGHT.br_blockcount, PREV.br_state); + if (error) goto done; } break; @@ -593,22 +592,24 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ip->i_d.di_nextents++; - if (cur == NULL) + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -619,38 +620,40 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. 
*/ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - if (cur == NULL) + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state))) + LEFT.br_state); + if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - --*idx; + bma->idx--; break; case BMAP_LEFT_FILLING: @@ -658,41 +661,43 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); + (bma->cur ? 
bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -701,37 +706,39 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state))) + RIGHT.br_state); + if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; + bma->idx++; break; case BMAP_RIGHT_FILLING: @@ -740,41 +747,43 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. 
*/ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; + bma->idx++; break; case 0: @@ -800,46 +809,48 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(ip, temp2)); + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if 
(bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = xfs_bmap_worst_indlen(ip, temp); - temp2 = xfs_bmap_worst_indlen(ip, temp2); + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); if (diff > 0) { - error = xfs_icsb_modify_counters(ip->i_mount, + error = xfs_icsb_modify_counters(bma->ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), 0); ASSERT(!error); @@ -847,15 +858,15 @@ xfs_bmap_add_extent_delay_real( goto done; } - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - ++*idx; + bma->idx++; da_new = temp + temp2; break; @@ -873,14 +884,15 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, da_old > 0, &tmp_logflags, XFS_DATA_FORK); - *logflagsp |= tmp_logflags; + bma->logflags |= tmp_logflags; if (error) goto done; } @@ -888,22 +900,22 @@ xfs_bmap_add_extent_delay_real( /* adjust for changes in reserved delayed indirect blocks */ if (da_old || da_new) { temp = da_new; - if (cur) - temp += cur->bc_private.b.allocated; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; ASSERT(temp <= da_old); if (temp < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - temp), 0); + xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); } /* clear out the allocated field, done with it now in any case. 
*/ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } - xfs_bmap_check_leaf_extents(cur, ip, XFS_DATA_FORK); + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); done: - *logflagsp |= rval; + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -4698,9 +4710,7 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) { - error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, - &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, &tmp_logflags); + error = xfs_bmap_add_extent_delay_real(bma); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, &bma->idx, &bma->cur, &bma->got, -- cgit v1.2.3-58-ga151 From c6534249851d062113ab4d8d226be8dba8ecb92e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:05 +0000 Subject: xfs: pass bmalloca to xfs_bmap_add_extent_hole_real All the parameters passed to xfs_bmap_add_extent_hole_real() are in the xfs_bmalloca structure now. Just pass the bmalloca parameter to the function instead of 8 separate parameters. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 161 ++++++++++++++++++++++++++---------------------------- 1 file changed, 78 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c8b9c4ec9f6f..2b31945ce9fe 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1596,34 +1596,25 @@ xfs_bmap_add_extent_hole_delay( */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - struct xfs_trans *tp, - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + struct xfs_bmalloca *bma, + int whichfork) { + struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ - xfs_btree_cur_t *cur; /* if null, not a btree */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - *logflagsp = 0; - - ifp = XFS_IFORK_PTR(ip, whichfork); - cur = *curp; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); - ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(xs_add_exlist); @@ -1634,9 +1625,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1645,9 +1636,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. 
*/ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1684,39 +1675,42 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - if (cur == NULL) { + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + right.br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1727,27 +1721,28 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, - left.br_startoff, - left.br_startblock, - left.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1758,28 +1753,30 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. 
*/ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, right.br_startblock, - right.br_blockcount, &i))) + right.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, - right.br_state))) + right.br_state); + if (error) goto done; } break; @@ -1790,21 +1787,23 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - if (cur == NULL) { + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, - new->br_blockcount, &i))) + new->br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1812,26 +1811,26 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, 0, &tmp_logflags, whichfork); - *logflagsp |= tmp_logflags; + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; if (error) goto done; } /* clear out the allocated field, done with it now in any case. 
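One idiom worth calling out in both xfs_bmap_add_extent_hole_real() above and the delay_real variant earlier: the neighbor checks OR bits into a single state word, and one switch on the combined mask then handles each merge case exactly once. A minimal sketch of that bitmask dispatch, with illustrative bit names rather than the real BMAP_* flags:

    #include <stdio.h>

    /* Illustrative bits; the real code has more (VALID, DELAY, FILLING). */
    #define LEFT_CONTIG  (1 << 0)
    #define RIGHT_CONTIG (1 << 1)

    static const char *classify(int left_touches, int right_touches)
    {
        int state = 0;

        if (left_touches)
            state |= LEFT_CONTIG;
        if (right_touches)
            state |= RIGHT_CONTIG;

        /* One switch on the combined mask handles each case exactly once. */
        switch (state) {
        case LEFT_CONTIG | RIGHT_CONTIG:
            return "merge all three extents into one";
        case LEFT_CONTIG:
            return "extend the left neighbor";
        case RIGHT_CONTIG:
            return "extend the right neighbor";
        case 0:
            return "insert a new extent record";
        }
        return "unreachable";
    }

    int main(void)
    {
        printf("%s\n", classify(1, 1));
        printf("%s\n", classify(0, 1));
        return 0;
    }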
*/ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } - xfs_bmap_check_leaf_extents(cur, ip, whichfork); + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - *logflagsp |= rval; + bma->logflags |= rval; return error; } @@ -4709,14 +4708,10 @@ xfs_bmapi_allocate( xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; - if (bma->wasdel) { + if (bma->wasdel) error = xfs_bmap_add_extent_delay_real(bma); - } else { - error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, - &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, &tmp_logflags, - whichfork); - } + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); bma->logflags |= tmp_logflags; if (error) -- cgit v1.2.3-58-ga151 From b0eab14e74d2d7b22d065e18a1cdebcf7716debf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:06 +0000 Subject: xfs: dont ignore error code from xfs_bmbt_update Fix a case in xfs_bmap_add_extent_unwritten_real where we aren't passing the returned error on. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 2b31945ce9fe..bb31d37bf49b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1217,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real( goto done; if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - if (xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state)) + LEFT.br_state); + if (error) goto done; } break; -- cgit v1.2.3-58-ga151 From d952e2f81244d6502aff126df5011fab10f92187 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:07 +0000 Subject: xfs: cleanup xfs_bmap.h Convert all function prototypes to the short form used elsewhere, and remove duplicates of comments already placed at the function body. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.h | 225 ++++++++---------------------------------------------- 1 file changed, 33 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 243e212b31f6..89ee672d378a 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -155,121 +155,29 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -/* - * Add bmap trace insert entries for all the contents of the extent list. - * - * Quite excessive tracing. Only do this for debug builds. - */ #if defined(__KERNEL) && defined(DEBUG) -void -xfs_bmap_trace_exlist( - struct xfs_inode *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in list */ - int whichfork, - unsigned long caller_ip); /* data or attr fork */ +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) #else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. 
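The xfs_bmbt_update fix above is a classic dropped-error bug: the return value was tested inside the if condition but never assigned, so the function later returned whatever stale value 'error' still held. A hypothetical stand-alone version that makes the difference visible; update_record() is a made-up stand-in, not the real XFS API:

    #include <stdio.h>

    /* Stand-in for a btree update that can fail. */
    static int update_record(void)
    {
        return -5;  /* always fail, to make the difference visible */
    }

    static int buggy(void)
    {
        int error = 0;

        if (update_record())    /* the failure is detected here... */
            goto done;          /* ...but its code is never captured */
    done:
        return error;           /* returns 0: the failure is lost */
    }

    static int fixed(void)
    {
        int error = update_record();

        if (error)
            goto done;
    done:
        return error;           /* returns -5, as it should */
    }

    int main(void)
    {
        printf("buggy()=%d fixed()=%d\n", buggy(), fixed());
        return 0;
    }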
- */ -int /* error code */ -xfs_bmap_add_attrfork( - struct xfs_inode *ip, /* incore inode pointer */ - int size, /* space needed for new attribute */ - int rsvd); /* flag for reserved block allocation */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - struct xfs_mount *mp); /* mount point structure */ - -/* - * Routine to clean up the free list data structure when - * an error occurs during a transaction. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist); /* free list to clean up */ - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - struct xfs_mount *mp, /* file system mount structure */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first unused block in the file. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - */ -int /* error */ -xfs_bmap_first_unused( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *unused, /* unused block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_before( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_offset( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *unused, /* last block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Read in the extents to iu_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. 
- */ -int /* error */ -xfs_bmap_read_extents( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); @@ -281,95 +189,28 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, struct xfs_bmap_free *flist); - -/* - * Unmap (remove) blocks from a file. - * If nexts is nonzero then the number of extents to remove is limited to - * that value. If not all extents in the block range can be removed then - * *done is set. - */ -int /* error */ -xfs_bunmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ - int flags, /* XFS_BMAPI_... */ - xfs_extnum_t nexts, /* number of extents max */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *done); /* set if not done yet */ - -/* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -uint -xfs_default_attroffset( - struct xfs_inode *ip); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); #ifdef __KERNEL__ - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed); /* xact committed or not */ - /* bmap to userspace formatter - copy to user & advance pointer */ typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); -/* - * Get inode's extents as described in bmv, and format for output. 
- */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg); /* formatter arg */ - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof); - -/* - * Count fsblocks of the given fork. - */ -int -xfs_bmap_count_blocks( - xfs_trans_t *tp, - struct xfs_inode *ip, - int whichfork, - int *count); - -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ -- cgit v1.2.3-58-ga151 From c029a50d51b8a9520105ec903639de03389915d0 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 21 Sep 2011 09:42:30 +0000 Subject: xfs: fix possible overflow in xfs_ioc_trim() In xfs_ioc_trim it is possible that computing the last allocation group to discard might overflow for big start & len values, because the result might be bigger than xfs_agnumber_t, which is 32 bits long. Fix this by not allowing the start and end block of the range to be beyond the end of the file system. Note that if the start is beyond the end of the file system we have to return -EINVAL, but in the "end" case we have to truncate it to the fs size. Also introduce an "end" variable, rather than using start+len, which might be more confusing to get right, as this bug shows. Signed-off-by: Lukas Czerner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_discard.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797dae32..8a24f0c6c860 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -38,7 +38,7 @@ xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, - xfs_fsblock_t len, + xfs_fsblock_t end, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { @@ -100,7 +100,7 @@ xfs_trim_extents( * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + XFS_AGB_TO_FSB(mp, agno, fbno) > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -145,7 +145,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, len, minlen; + xfs_fsblock_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -165,19 +165,19 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface.
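To see why the trim patch clamps the block range before any allocation-group arithmetic: dividing a huge 64-bit block number by the AG size can yield a quotient that no longer fits in the 32-bit xfs_agnumber_t, and the silent truncation produces nonsense loop bounds. A self-contained demonstration with invented geometry; fsb_to_agno() below is a stand-in, not the real XFS_FSB_TO_AGNO:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t agnumber_t;

    #define BLOCKS_PER_AG (1ULL << 20)
    #define FS_BLOCKS     (1ULL << 30)  /* 1024 AGs in total */

    static agnumber_t fsb_to_agno(uint64_t fsb)
    {
        return (agnumber_t)(fsb / BLOCKS_PER_AG);  /* silently truncates */
    }

    int main(void)
    {
        uint64_t start = 0;
        uint64_t len = 1ULL << 52;  /* huge range from userspace */
        uint64_t end = start + len - 1;

        /* Naive: 2^52 / 2^20 = 2^32 does not fit in 32 bits, so the
         * "last AG" wraps to 0 and the trim loop gets nonsense bounds. */
        agnumber_t naive_end_agno = fsb_to_agno(start + len);

        /* Fixed: clamp the block range to the device before any AG math. */
        if (start >= FS_BLOCKS)
            return 1;               /* the patch returns -EINVAL here */
        if (end > FS_BLOCKS - 1)
            end = FS_BLOCKS - 1;

        printf("naive end_agno=%u, clamped end_agno=%u\n",
               naive_end_agno, fsb_to_agno(end));
        return 0;
    }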
*/ start = XFS_B_TO_FSBT(mp, range.start); - len = XFS_B_TO_FSBT(mp, range.len); + end = start + XFS_B_TO_FSBT(mp, range.len) - 1; minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); - start_agno = XFS_FSB_TO_AGNO(mp, start); - if (start_agno >= mp->m_sb.sb_agcount) + if (start >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); + if (end > mp->m_sb.sb_dblocks - 1) + end = mp->m_sb.sb_dblocks - 1; - end_agno = XFS_FSB_TO_AGNO(mp, start + len); - if (end_agno >= mp->m_sb.sb_agcount) - end_agno = mp->m_sb.sb_agcount - 1; + start_agno = XFS_FSB_TO_AGNO(mp, start); + end_agno = XFS_FSB_TO_AGNO(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { - error = -xfs_trim_extents(mp, agno, start, len, minlen, + error = -xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); if (error) last_error = error; -- cgit v1.2.3-58-ga151 From 815cb21662b914e1e14c256a3d662b1352c8509e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Sep 2011 09:14:34 +0000 Subject: xfs: XFS_TRANS_SWAPEXT is not a valid flag for xfs_trans_commit XFS_TRANS_SWAPEXT is a transaction type, not a flag for xfs_trans_commit, so don't pass it in xfs_swap_extents. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_dfrag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 9a84a85c03b1..645387b69738 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -438,7 +438,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + error = xfs_trans_commit(tp, 0); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); -- cgit v1.2.3-58-ga151 From b10370585349d364ff3c550afa7922e6e21f029d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Sep 2011 14:55:51 +0000 Subject: xfs: unlock the inode before log force in xfs_fsync Only read the LSN we need to push to with the ilock held, and then release it before we do the log force to improve concurrency. This also removes the only direct caller of _xfs_trans_commit, thus allowing it to be merged into the plain xfs_trans_commit again. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 15 ++++++++------- fs/xfs/xfs_trans.c | 11 ++++------- fs/xfs/xfs_trans.h | 5 +---- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 558543c146b3..d8ef02eb178a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -137,6 +137,7 @@ xfs_file_fsync( struct xfs_trans *tp; int error = 0; int log_flushed = 0; + xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -214,9 +215,9 @@ xfs_file_fsync( */ xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); + error = xfs_trans_commit(tp, 0); + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* @@ -227,14 +228,14 @@ xfs_file_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. 
*/ - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); } + if (!error && lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index efc147f0e9b6..b64c8c79736a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1790,9 +1790,7 @@ xfs_trans_commit_cil( } /* - * xfs_trans_commit - * - * Commit the given transaction to the log a/synchronously. + * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical * transaction abort mechanism. Logically after the filesystem @@ -1804,10 +1802,9 @@ xfs_trans_commit_cil( * Do not reference the transaction structure after this call. */ int -_xfs_trans_commit( +xfs_trans_commit( struct xfs_trans *tp, - uint flags, - int *log_flushed) + uint flags) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; @@ -1866,7 +1863,7 @@ _xfs_trans_commit( if (sync) { if (!error) { error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); + XFS_LOG_SYNC, NULL); } XFS_STATS_INC(xs_trans_sync); } else { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759b6352..54b0b1cc3e3f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -487,10 +487,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int _xfs_trans_commit(xfs_trans_t *, - uint flags, - int *); -#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +int xfs_trans_commit(xfs_trans_t *, uint flags); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -- cgit v1.2.3-58-ga151 From 8292d88c5c833fc8b837c3a018fd6d72c35a3231 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:47:50 +0000 Subject: xfs: unlock the inode before log force in xfs_fs_nfs_commit_metadata Only read the LSN we need to push to with the ilock held, and then release it before we do the log force to improve concurrency. 
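The xfs_fsync patch above and the export-operations patch below apply the same pattern: hold the inode lock only long enough to read the LSN, drop it, and only then issue the potentially slow synchronous log force. A compilable model of that ordering, with stand-in types and a fake log_force() in place of _xfs_log_force_lsn():

    #include <stdio.h>

    typedef long long lsn_t;

    struct demo_inode {
        int   locked;
        int   pincount;   /* non-zero: changes not yet on disk */
        lsn_t last_lsn;   /* LSN the inode was last logged at */
    };

    static void ilock(struct demo_inode *ip)   { ip->locked = 1; }
    static void iunlock(struct demo_inode *ip) { ip->locked = 0; }

    static int log_force(lsn_t lsn)
    {
        /* a potentially slow, synchronous operation */
        printf("forcing log to lsn %lld\n", lsn);
        return 0;
    }

    static int commit_metadata(struct demo_inode *ip)
    {
        lsn_t lsn = 0;

        ilock(ip);
        if (ip->pincount)
            lsn = ip->last_lsn;   /* only *read* the LSN under the lock */
        iunlock(ip);

        /* The expensive log force runs with the inode unlocked, so
         * other threads can use the inode while we wait on the log. */
        if (!lsn)
            return 0;
        return log_force(lsn);
    }

    int main(void)
    {
        struct demo_inode ip = { 0, 1, 1234 };
        return commit_metadata(&ip);
    }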
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_export.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d322e48f..da108977b21f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + xfs_lsn_t lsn = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, NULL); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { -- cgit v1.2.3-58-ga151 From 23bb0be1a237c8732ce1a43140e5cb103a676b92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:47:51 +0000 Subject: xfs: unlock the inode before log force in xfs_change_file_space Let the transaction commit unlock the inode before it potentially causes a synchronous log force. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_vnodeops.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f47ecee8d437..c9c8e8230b21 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2349,8 +2349,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; @@ -2375,10 +2374,5 @@ xfs_change_file_space( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (attr_flags & XFS_ATTR_SYNC) xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } -- cgit v1.2.3-58-ga151 From ddc3415aba1cb2f86d1fcad720cea834ee178f54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Sep 2011 15:00:54 +0000 Subject: xfs: simplify xfs_trans_ijoin* again There is no reason to keep a reference to the inode even if we unlock it during transaction commit because we never drop a reference between the ijoin and commit. Also use this fact to merge xfs_trans_ijoin_ref back into xfs_trans_ijoin - the third argument decides if an unlock is needed now. I'm actually starting to wonder if allowing inodes to be unlocked at transaction commit really is worth the effort. The only real benefit is that they can be unlocked earlier when committing a synchronous transaction, but that could be solved by doing the log force manually after the unlock, too.
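A reduced model of the merged interface described above, using stand-in names and flag values: the lock_flags argument recorded at join time tells commit whether to unlock the inode, and no extra reference is needed because none is dropped between join and commit. The mechanical call-site conversions follow below.

    #include <stdio.h>

    #define ILOCK_EXCL 0x1

    struct demo_inode { unsigned int lock_flags; };

    /* After the merge there is a single xfs_trans_ijoin()-style call:
     * lock_flags == 0 means the caller keeps the inode locked across
     * commit; non-zero means commit unlocks it. */
    static void trans_ijoin(struct demo_inode *ip, unsigned int lock_flags)
    {
        ip->lock_flags = lock_flags;
    }

    static void trans_commit(struct demo_inode *ip)
    {
        if (ip->lock_flags)
            printf("commit unlocks inode (flags 0x%x)\n", ip->lock_flags);
        else
            printf("caller still holds the inode lock\n");
    }

    int main(void)
    {
        struct demo_inode a = { 0 }, b = { 0 };

        trans_ijoin(&a, 0);           /* old xfs_trans_ijoin() behavior */
        trans_ijoin(&b, ILOCK_EXCL);  /* old xfs_trans_ijoin_ref() behavior */
        trans_commit(&a);
        trans_commit(&b);
        return 0;
    }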
Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 28 ++++++++++++++-------------- fs/xfs/xfs_bmap.c | 4 ++-- fs/xfs/xfs_dfrag.c | 4 ++-- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c | 6 +++--- fs/xfs/xfs_inode_item.c | 4 +--- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iomap.c | 6 +++--- fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_rename.c | 8 ++++---- fs/xfs/xfs_rtalloc.c | 10 +++++----- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans.h | 3 +-- fs/xfs/xfs_trans_inode.c | 25 +++++-------------------- fs/xfs/xfs_vnodeops.c | 34 +++++++++++++++++----------------- 18 files changed, 65 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 5484766938f9..981a0624f382 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -319,7 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * If the attribute list is non-existent or a shortform list, @@ -389,7 +389,7 @@ xfs_attr_set_int( * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Commit the leaf transformation. We'll need another (linked) @@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -961,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the current trans (including the inode) and start @@ -1063,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); @@ -1137,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); return(0); @@ -1291,7 +1291,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the node conversion and start the next @@ -1328,7 +1328,7 @@ restart: * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1440,7 +1440,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1572,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. 
*/ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the Btree join operation and start a new trans. @@ -1623,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_brelse(args->trans, bp); } @@ -2060,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2207,7 +2207,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, args->dp); + xfs_trans_ijoin(args->trans, args->dp, 0); /* * Close out trans and start the next one in the chain. diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index bb31d37bf49b..c68baeb0974a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2195,7 +2195,7 @@ xfs_bmap_rtalloc( * Lock out other modifications to the RT bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, @@ -3460,7 +3460,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 645387b69738..654dc6f05bac 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -425,8 +425,8 @@ xfs_swap_extents( } - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 179673531f20..c597bfe4ada0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -377,7 +377,7 @@ xfs_qm_dqalloc( return (ESRCH); } - xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; error = xfs_bmapi_write(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d8ef02eb178a..0b600b51778c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -213,7 +213,7 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp, 0); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f690d3a10b34..b676494a55ca 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1152,7 +1152,7 @@ xfs_ialloc( /* * Log the new values stuffed into the inode. 
*/ - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ @@ -1297,7 +1297,7 @@ xfs_itruncate_extents( */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (committed) - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out_bmap_cancel; @@ -1313,7 +1313,7 @@ xfs_itruncate_extents( error = xfs_trans_commit(tp, 0); tp = ntp; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 588406dc6a35..8704a99241d7 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -658,10 +658,8 @@ xfs_inode_item_unlock( lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; - if (lock_flags) { + if (lock_flags) xfs_iunlock(ip, lock_flags); - IRELE(ip); - } } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7ce7debe14c..d99a90518909 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1069,7 +1069,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index da5bf05c5bb7..9afa282aa937 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -208,7 +208,7 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; if (offset < ip->i_size || extsz) @@ -528,7 +528,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &first_block); @@ -692,7 +692,7 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Modify the unwritten extent state of the buffer. diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e6b3e7620888..556bbe7751b7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -611,7 +611,7 @@ xfs_setattr_nonsize( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -863,7 +863,7 @@ xfs_setattr_size( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f42e6c..5cc3dde1bc90 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index df78c297d1a1..866de277079a 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -170,12 +170,12 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. 
*/ - xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) - xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f29424964521..87323f1ded64 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -112,7 +112,7 @@ xfs_growfs_rt_alloc( * Lock the inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&flist, &firstblock); /* @@ -155,7 +155,7 @@ xfs_growfs_rt_alloc( * Lock the bitmap inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Get a buffer for the block. */ @@ -1960,7 +1960,7 @@ xfs_growfs_rt( * Lock out other callers by grabbing the bitmap inode lock. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * Update the bitmap inode's size. */ @@ -1972,7 +1972,7 @@ xfs_growfs_rt( * Get the summary inode into the transaction. */ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * Update the summary inode's size. */ @@ -2143,7 +2143,7 @@ xfs_rtfree_extent( * Synchronize by locking the bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); #if defined(__KERNEL__) && defined(DEBUG) /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54d5e102ffe1..6ad05e68abda 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -884,7 +884,7 @@ xfs_log_inode( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b64c8c79736a..1f35b2feca97 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -2018,6 +2018,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 54b0b1cc3e3f..f5df16969f82 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); -void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index c8dea2fd7e68..32f0288ae10f 100644 --- 
a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug( * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_inode *ip, + uint lock_flags) { xfs_inode_log_item_t *iip; @@ -59,7 +61,9 @@ xfs_trans_ijoin( if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; /* * Get a log_item_desc to point at the new item. @@ -69,25 +73,6 @@ xfs_trans_ijoin( xfs_trans_inode_broot_debug(ip); } -/* - * Add a locked inode to the transaction. - * - * - * Grabs a reference to the inode which will be dropped when the transaction - * is committed. The inode will also be unlocked at that point. The inode - * must be locked, and it cannot be associated with any transaction. - */ -void -xfs_trans_ijoin_ref( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint lock_flags) -{ - xfs_trans_ijoin(tp, ip); - IHOLD(ip); - ip->i_itemp->ili_lock_flags = lock_flags; -} - /* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c9c8e8230b21..fc38f15808c6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -219,7 +219,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { @@ -288,7 +288,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -336,7 +336,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -469,7 +469,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -663,7 +663,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { @@ -687,7 +687,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -700,7 +700,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } /* @@ -940,7 +940,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
*/ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1261,8 +1261,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. @@ -1407,8 +1407,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -1602,7 +1602,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -1735,7 +1735,7 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; @@ -1878,7 +1878,7 @@ xfs_alloc_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &firstfsb); error = xfs_bmapi_write(tp, ip, startoffset_fsb, @@ -2176,7 +2176,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * issue the bunmapi() call to free the blocks @@ -2349,7 +2349,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; -- cgit v1.2.3-58-ga151 From 3815832a2aa4df9815d15dac05227e0c8551833f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 30 Sep 2011 04:45:02 +0000 Subject: xfs: Don't allocate new buffers on every call to _xfs_buf_find Stats show that for an 8-way unlink @ ~80,000 unlinks/s we are doing ~1 million cache hit lookups to ~3000 buffer creates. That's almost 3 orders of magnitude more cache hits than misses, so optimising for cache hits is quite important. In the cache hit case, we do not need the new buffer that was allocated in case of a cache miss, so we are effectively hitting the allocator for no good reason for the vast majority of calls to _xfs_buf_find. 8-way create workloads are showing similar cache hit/miss ratios. The result is profiles that look like this: samples pcnt function DSO _______ _____ _______________________________ _________________ 1036.00 10.0% _xfs_buf_find [kernel.kallsyms] 582.00 5.6% kmem_cache_alloc [kernel.kallsyms] 519.00 5.0% __memcpy [kernel.kallsyms] 468.00 4.5% __ticket_spin_lock [kernel.kallsyms] 388.00 3.7% kmem_cache_free [kernel.kallsyms] 331.00 3.2% xfs_log_commit_cil [kernel.kallsyms] Further, there is a fair bit of work involved in initialising a new buffer once a cache miss has occurred and we currently do that under the rbtree spinlock. That increases spinlock hold time on what are heavily used trees. To fix this, remove the initialisation of the buffer from _xfs_buf_find() and only allocate the new buffer once we've had a cache miss (a sketch of this pattern follows below).
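The lookup-then-allocate pattern just described can be illustrated with a minimal, self-contained user-space sketch. This is not the XFS code: struct buf, cache_find_locked() and buf_get() are hypothetical stand-ins, a single mutex stands in for the rbtree spinlock, and reference counting and eviction are omitted for brevity.

	/*
	 * Hedged sketch of "look up first, allocate only on a miss, recheck on
	 * insert": the common cache-hit path never touches the allocator, and
	 * the allocation itself happens outside the lock.
	 */
	#include <pthread.h>
	#include <stdlib.h>

	struct buf {
		long		blkno;
		struct buf	*next;
	};

	static struct buf *cache_head;
	static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

	static struct buf *cache_find_locked(long blkno)
	{
		struct buf *bp;

		for (bp = cache_head; bp; bp = bp->next)
			if (bp->blkno == blkno)
				return bp;
		return NULL;
	}

	struct buf *buf_get(long blkno)
	{
		struct buf *bp, *new_bp;

		pthread_mutex_lock(&cache_lock);
		bp = cache_find_locked(blkno);		/* common case: cache hit */
		pthread_mutex_unlock(&cache_lock);
		if (bp)
			return bp;

		new_bp = calloc(1, sizeof(*new_bp));	/* allocate outside the lock */
		if (!new_bp)
			return NULL;
		new_bp->blkno = blkno;

		pthread_mutex_lock(&cache_lock);
		bp = cache_find_locked(blkno);		/* recheck: another thread may have raced us */
		if (!bp) {
			new_bp->next = cache_head;	/* confirmed miss: insert our buffer */
			cache_head = new_bp;
			bp = new_bp;
		} else {
			free(new_bp);			/* lost the race: discard the spare */
		}
		pthread_mutex_unlock(&cache_lock);
		return bp;
	}

The key design point is that the allocator is paid for only on the rare miss path, and the recheck under the lock keeps the insert correct when two threads miss on the same block concurrently.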
Initialise the buffer immediately after allocating it in xfs_buf_get, too, so that it is ready for insert if we get another cache miss after allocation. This minimises lock hold time and avoids unnecessary allocator churn. The resulting profiles look like: samples pcnt function DSO _______ _____ ___________________________ _________________ 8111.00 9.1% _xfs_buf_find [kernel.kallsyms] 4380.00 4.9% __memcpy [kernel.kallsyms] 4341.00 4.8% __ticket_spin_lock [kernel.kallsyms] 3401.00 3.8% kmem_cache_alloc [kernel.kallsyms] 2856.00 3.2% xfs_log_commit_cil [kernel.kallsyms] 2625.00 2.9% __kmalloc [kernel.kallsyms] 2380.00 2.7% kfree [kernel.kallsyms] 2016.00 2.3% kmem_cache_free [kernel.kallsyms] Showing a significant reduction in time spent doing allocation and freeing from slabs (kmem_cache_alloc and kmem_cache_free). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e3af85095ddb..6785b7bd952d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,8 +477,6 @@ _xfs_buf_find( /* No match found */ if (new_bp) { - _xfs_buf_initialize(new_bp, btp, range_base, - range_length, flags); rb_link_node(&new_bp->b_rbnode, parent, rbp); rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ @@ -521,35 +519,53 @@ found: } /* - * Assembles a buffer covering the specified range. - * Storage in memory for all portions of the buffer will be allocated, - * although backing storage may not be. + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. */ -xfs_buf_t * +struct xfs_buf * xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags) { - xfs_buf_t *bp, *new_bp; + struct xfs_buf *bp; + struct xfs_buf *new_bp; int error = 0; + bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + if (likely(bp)) + goto found; + new_bp = xfs_buf_allocate(flags); if (unlikely(!new_bp)) return NULL; + _xfs_buf_initialize(new_bp, target, + ioff << BBSHIFT, isize << BBSHIFT, flags); + bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (!bp) { + xfs_buf_deallocate(new_bp); + return NULL; + } + if (bp == new_bp) { error = xfs_buf_allocate_memory(bp, flags); if (error) goto no_buffer; - } else { + } else xfs_buf_deallocate(new_bp); - if (unlikely(bp == NULL)) - return NULL; - } + /* + * Now we have a workable buffer, fill in the block number so + * that we can do IO on it. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + +found: if (!(bp->b_flags & XBF_MAPPED)) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { @@ -560,18 +576,10 @@ xfs_buf_get( } XFS_STATS_INC(xb_get); - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later.
- */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; - trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - no_buffer: +no_buffer: if (flags & (XBF_LOCK | XBF_TRYLOCK)) xfs_buf_unlock(bp); xfs_buf_rele(bp); -- cgit v1.2.3-58-ga151 From 670ce93fef93bba8c8a422a79747385bec8e846a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 30 Sep 2011 04:45:03 +0000 Subject: xfs: reduce the number of log forces from tail pushing The AIL push code will issue a log force on every single push loop that it exits and has encountered pinned items. It doesn't rescan these pinned items until it revisits the AIL from the start. Hence we only need to force the log once per walk from the start of the AIL to the target LSN. This results in numbers like this: xs_push_ail_flush..... 1456 xs_log_force......... 1485 For an 8-way 50M inode create workload - almost all the log forces are coming from the AIL pushing code. Reduce the number of log forces by only forcing the log if the previous walk found pinned buffers. This reduces the numbers to: xs_push_ail_flush..... 665 xs_log_force......... 682 For the same test. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans_ail.c | 33 ++++++++++++++++++++------------- fs/xfs/xfs_trans_priv.h | 1 + 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index c15aa29fa169..9df7f9f1b5ee 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -372,12 +372,24 @@ xfs_ail_worker( xfs_lsn_t lsn; xfs_lsn_t target; long tout = 10; - int flush_log = 0; int stuck = 0; int count = 0; int push_xfsbufd = 0; + /* + * If last time we ran we encountered pinned items, force the log first + * and wait for it before pushing again. + */ spin_lock(&ailp->xa_lock); + if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && + !list_empty(&ailp->xa_ail)) { + ailp->xa_log_flush = 0; + spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + spin_lock(&ailp->xa_lock); + } + target = ailp->xa_target; lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { @@ -435,7 +447,7 @@ xfs_ail_worker( case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); stuck++; - flush_log = 1; + ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: @@ -480,16 +492,6 @@ xfs_ail_worker( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (flush_log) { - /* - * If something we need to push out was pinned, then - * push out the log so it will become unpinned and - * move forward in the AIL. - */ - XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, 0); - } - if (push_xfsbufd) { /* we've got delayed write buffers to flush */ wake_up_process(mp->m_ddev_targp->bt_task); @@ -500,6 +502,7 @@ out_done: if (!count) { /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; + ailp->xa_log_flush = 0; /* * We clear the XFS_AIL_PUSHING_BIT first before checking @@ -532,9 +535,13 @@ out_done: * were stuck. * * Backoff a bit more to allow some I/O to complete before - * continuing from where we were. + * restarting from the start of the AIL. This prevents us + * from spinning on the same items, and if they are pinned will + * allow the restart to issue a log force to unpin the stuck + * items. */ tout = 20; + ailp->xa_last_pushed_lsn = 0; } /* There is more to do, requeue us.
*/ diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 212946b97239..0a6eec6d472a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -71,6 +71,7 @@ struct xfs_ail { struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; unsigned long xa_flags; + int xa_log_flush; }; #define XFS_AIL_PUSHING_BIT 0 -- cgit v1.2.3-58-ga151 From 1da2f2dbf2d2aaa1b0f6ca2f61fcf07e24eb659b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Oct 2011 14:25:16 +0000 Subject: xfs: optimize fsync on directories Directories are only updated transactionally, which means fsync only needs to flush the log if the inode is currently dirty, but not bother with checking for dirty data, non-transactional updates, and most importantly doesn't have to flush disk caches except as part of a transaction commit. While the first two optimizations can't easily be measured, the latter actually makes a difference when doing lots of fsyncs that do not actually have to commit the inode, e.g. because an earlier fsync already pushed the log far enough. The new xfs_dir_fsync is identical to xfs_nfs_commit_metadata except for the prototype, but I'm not sure creating a common helper for the two is worth it given how simple the functions are. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 31 ++++++++++++++++++++++++++++++- fs/xfs/xfs_trace.h | 1 + 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0b600b51778c..753ed9b5c70b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,6 +124,35 @@ xfs_iozero( return (-status); } +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + STATIC int xfs_file_fsync( struct file *file, @@ -1140,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .fsync = xfs_file_fsync, + .fsync = xfs_dir_fsync, }; static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb5e660e0fab..b78f2c65438b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -576,6 +576,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); DEFINE_INODE_EVENT(xfs_file_compat_ioctl); DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); DEFINE_INODE_EVENT(xfs_write_inode); -- cgit v1.2.3-58-ga151 From 87c7bec7fc3377b3873eb3a0f4b603981ea16ebb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2011 14:08:26 +0000 Subject: xfs: fix buffer flushing during unmount The code to flush buffers in the umount code is a bit iffy: we first flush all delwri buffers out, but then might be able to queue up a new one when logging the sb counts.
On a normal shutdown that one would get flushed out when doing the synchronous superblock write in xfs_unmountfs_writesb, but we skip that one if the filesystem has been shut down. Fix this by moving the delwri list flushing to just before unmounting the log, and while we're at it also remove the superfluous delwri list and buffer lru flushing for the rt and log device that can never have cached or delwri buffers. Signed-off-by: Christoph Hellwig Reported-by: Amit Sahrawat Tested-by: Amit Sahrawat Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.h | 1 - fs/xfs/xfs_mount.c | 29 ++++++++++------------------- 2 files changed, 10 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c23826685d3d..200f59138b8a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -318,7 +318,6 @@ extern struct list_head *xfs_get_buftarg_list(void); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a957e437bee1..e8fe1cb54094 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,9 +44,6 @@ #include "xfs_trace.h" -STATIC void xfs_unmountfs_wait(xfs_mount_t *); - - #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); @@ -1496,11 +1493,6 @@ xfs_unmountfs( */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1526,7 +1518,16 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. + */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1537,16 +1538,6 @@ xfs_unmountfs( xfs_free_perag(mp); } -STATIC void -xfs_unmountfs_wait(xfs_mount_t *mp) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) - xfs_wait_buftarg(mp->m_rtdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); -} - int xfs_fs_writable(xfs_mount_t *mp) { -- cgit v1.2.3-58-ga151 From b17b833443a3b65907f5ecb36f8af33996f6ec78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:43 +0000 Subject: xfs: remove xfs_get_buftarg_list The code is unused and under a config option that doesn't exist, remove it.
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 8 -------- fs/xfs/xfs_buf.h | 4 ---- 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6785b7bd952d..e0339a4a0bc8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1835,11 +1835,3 @@ xfs_buf_terminate(void) destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } - -#ifdef CONFIG_KDB_MODULES -struct list_head * -xfs_get_buftarg_list(void) -{ - return &xfs_buftarg_list; -} -#endif diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 200f59138b8a..cb54cc234ed1 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -311,10 +311,6 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#ifdef CONFIG_KDB_MODULES -extern struct list_head *xfs_get_buftarg_list(void); -#endif - #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -- cgit v1.2.3-58-ga151 From 5fde0326ddb1472ef31034c8ed952a19d4679191 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:44 +0000 Subject: xfs: remove XFS_BUF_FINISH_IOWAIT Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_buf.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e0339a4a0bc8..36fed03da26f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1100,7 +1100,7 @@ xfs_bioerror_relse( * ASYNC buffers. */ xfs_buf_ioerror(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); + complete(&bp->b_iowait); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index cb54cc234ed1..65181220c9a2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -293,8 +293,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) return atomic_read(&bp->b_pin_count); } -#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); - static inline void xfs_buf_relse(xfs_buf_t *bp) { xfs_buf_unlock(bp); -- cgit v1.2.3-58-ga151 From 38f23232449c9d2c0bc8e9541cb8ab08b7c2b9ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:45 +0000 Subject: xfs: remove XFS_BUF_SET_VTYPE and XFS_BUF_SET_VTYPE_REF Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_btree.c | 8 ++++---- fs/xfs/xfs_buf.h | 7 +------ fs/xfs/xfs_da_btree.c | 11 ++++------- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 6 ------ 7 files changed, 13 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index bdd9cb54d63b..ce84ffd0264c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -452,7 +452,7 @@ xfs_alloc_read_agfl( if (error) return error; ASSERT(!xfs_buf_geterror(bp)); - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; } @@ -2139,7 +2139,7 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 28cc0199dc1f..1f19f03af9d3 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -631,7 +631,7 @@ 
xfs_btree_read_bufl( } ASSERT(!xfs_buf_geterror(bp)); if (bp) - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + xfs_buf_set_ref(bp, refval); *bpp = bp; return 0; } @@ -939,13 +939,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 65181220c9a2..ca2934717343 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -278,15 +278,10 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -static inline void -xfs_buf_set_ref( - struct xfs_buf *bp, - int lru_ref) +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { atomic_set(&bp->b_lru_ref, lru_ref); } -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) -#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) static inline int xfs_buf_ispinned(struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 46c8aa2740da..77c74257c2a3 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2053,13 +2053,10 @@ xfs_da_do_buf( if (!bp) continue; if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, - XFS_ATTR_BTREE_REF); - } else { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, - XFS_DIR_BTREE_REF); - } + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); } if (bplist) { bplist[nbplist++] = bp; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c597bfe4ada0..25d7280e9f6b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -605,7 +605,7 @@ xfs_qm_dqread( dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); /* Mark the buf so that this will stay incore a little longer */ - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 207e0b0c0730..169380e66057 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1505,7 +1505,7 @@ xfs_read_agi( return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + xfs_buf_set_ref(*bpp, XFS_AGI_REF); xfs_check_agi_unlinked(agi); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b676494a55ca..21cec6cdf453 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -190,12 +190,6 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - *bpp = bp; return 0; } -- cgit v1.2.3-58-ga151 From c867cb61641751fd3d86350232d64ae2a10137d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:46 +0000 Subject: xfs: remove XFS_BUF_STALE and XFS_BUF_SUPER_STALE Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_buf.c | 4 ++-- fs/xfs/xfs_buf.h | 6 ------ fs/xfs/xfs_buf_item.c | 6 ++++-- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_log.c | 4 ++-- 
fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_rw.c | 2 +- fs/xfs/xfs_trans_buf.c | 13 +++++++++---- 9 files changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 981a0624f382..ae8f917490d4 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2168,7 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) */ bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_delwri_dequeue(bp); xfs_buf_relse(bp); bp = NULL; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 36fed03da26f..f88eab9e8144 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1061,7 +1061,7 @@ xfs_bioerror( XFS_BUF_UNREAD(bp); xfs_buf_delwri_dequeue(bp); XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); @@ -1090,7 +1090,7 @@ xfs_bioerror_relse( XFS_BUF_UNREAD(bp); xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bp->b_iodone = NULL; if (!(fl & XBF_ASYNC)) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index ca2934717343..fa38401449d9 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -242,14 +242,8 @@ xfs_buf_target_name(struct xfs_buftarg *target) XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_SUPER_STALE(bp) do { \ - XFS_BUF_STALE(bp); \ - xfs_buf_delwri_dequeue(bp); \ - XFS_BUF_DONE(bp); \ - } while (0) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3243083d8693..8213f4a753dc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -966,7 +966,9 @@ xfs_buf_iodone_callbacks( * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + xfs_buf_delwri_dequeue(bp); + XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } @@ -1005,7 +1007,7 @@ xfs_buf_iodone_callbacks( * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). 
*/ - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); XFS_BUF_DONE(bp); xfs_buf_delwri_dequeue(bp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 21cec6cdf453..c0237c602f11 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2469,11 +2469,11 @@ cluster_corrupt_out: */ if (bp->b_iodone) { XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); } } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3a8d4f66d702..493447dc4f55 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -881,7 +881,7 @@ xlog_iodone(xfs_buf_t *bp) if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed @@ -1247,7 +1247,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); /* * It would seem logical to return EIO here, but we rely on diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index be173852b2ca..8f70f3469997 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2174,7 +2174,7 @@ xlog_recover_buffer_pass2( be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 99823c3b9aca..ff33645fe62d 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -150,7 +150,7 @@ xfs_read_buf( if (bp) { XFS_BUF_UNDONE(bp); xfs_buf_delwri_dequeue(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 5e5196a269dd..d03a8ee19172 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -160,8 +160,11 @@ xfs_trans_get_buf(xfs_trans_t *tp, bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) - XFS_BUF_SUPER_STALE(bp); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + xfs_buf_delwri_dequeue(bp); + XFS_BUF_DONE(bp); + } /* * If the buffer is stale then it was binval'ed @@ -387,7 +390,9 @@ xfs_trans_read_buf( } if (bp->b_error) { error = bp->b_error; - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + xfs_buf_delwri_dequeue(bp); + XFS_BUF_DONE(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); if (tp->t_flags & XFS_TRANS_DIRTY) @@ -740,7 +745,7 @@ xfs_trans_binval( * rid of it. */ xfs_buf_delwri_dequeue(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; -- cgit v1.2.3-58-ga151 From af5c4bee499eb68bc36ca046030394d82d0e3669 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:47 +0000 Subject: xfs: remove buffers from the delwri list in xfs_buf_stale For each call to xfs_buf_stale we call xfs_buf_delwri_dequeue either directly before or after it, or we are guaranteed by the surrounding conditionals that it is never called on delwri buffers.
Simplify this situation by moving the call to xfs_buf_delwri_dequeue into xfs_buf_stale. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 1 - fs/xfs/xfs_buf.c | 3 +-- fs/xfs/xfs_buf_item.c | 2 -- fs/xfs/xfs_rw.c | 1 - fs/xfs/xfs_trans_buf.c | 3 --- 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index ae8f917490d4..1e5d97f86ea8 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2169,7 +2169,6 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); xfs_buf_relse(bp); bp = NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f88eab9e8144..3df7d0a2b245 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -151,6 +151,7 @@ xfs_buf_stale( struct xfs_buf *bp) { bp->b_flags |= XBF_STALE; + xfs_buf_delwri_dequeue(bp); atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -1059,7 +1060,6 @@ xfs_bioerror( * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); @@ -1088,7 +1088,6 @@ xfs_bioerror_relse( * change that interface. */ XFS_BUF_UNREAD(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); xfs_buf_stale(bp); bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8213f4a753dc..06a76ca99659 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -967,7 +967,6 @@ xfs_buf_iodone_callbacks( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; @@ -1009,7 +1008,6 @@ xfs_buf_iodone_callbacks( */ xfs_buf_stale(bp); XFS_BUF_DONE(bp); - xfs_buf_delwri_dequeue(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index ff33645fe62d..86f1928b4cfa 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -149,7 +149,6 @@ xfs_read_buf( } if (bp) { XFS_BUF_UNDONE(bp); - xfs_buf_delwri_dequeue(bp); xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index d03a8ee19172..5bab5980a6f9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -162,7 +162,6 @@ xfs_trans_get_buf(xfs_trans_t *tp, ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); } @@ -391,7 +390,6 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); if (tp->t_flags & XFS_TRANS_DIRTY) @@ -744,7 +742,6 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - xfs_buf_delwri_dequeue(bp); xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); -- cgit v1.2.3-58-ga151 From 4347b9d7ad4223474d315c3ab6bc1ce7cce7fa2d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:48 +0000 Subject: xfs: clean up buffer allocation Change _xfs_buf_initialize to allocate the buffer directly and rename it to xfs_buf_alloc now that it is the only buffer allocation routine. Also remove the xfs_buf_deallocate wrapper around the kmem_zone_free calls for buffers.
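The shape of this cleanup, collapsing a raw allocation plus a separate initialisation helper into one constructor and dropping the free wrapper, can be sketched in plain C. This is a hedged illustration with hypothetical names (obj_alloc and obj_free stand in for xfs_buf_alloc and the direct kmem_zone_free call), not the kernel code:

	#include <stdlib.h>

	struct obj {
		long	base;
		size_t	len;
		int	flags;
	};

	/* One constructor owns both allocation and initialisation ... */
	struct obj *obj_alloc(long base, size_t len, int flags)
	{
		struct obj *o = calloc(1, sizeof(*o));	/* stands in for kmem_zone_alloc() */

		if (!o)
			return NULL;
		o->base = base;
		o->len = len;
		o->flags = flags;
		return o;
	}

	/* ... and teardown calls the allocator directly, with no wrapper. */
	void obj_free(struct obj *o)
	{
		free(o);				/* stands in for kmem_zone_free() */
	}

With a single constructor, callers can never observe a half-initialised object, which is what the reworked xfs_buf_get fast path relies on.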
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 50 ++++++++++++++++++-------------------------------- fs/xfs/xfs_buf.h | 3 ++- fs/xfs/xfs_log.c | 2 +- 3 files changed, 21 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 3df7d0a2b245..1f24ee5f0d7a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -65,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xb_to_km(flags) \ (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) -#define xfs_buf_allocate(flags) \ - kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) -#define xfs_buf_deallocate(bp) \ - kmem_zone_free(xfs_buf_zone, (bp)); static inline int xfs_buf_is_vmapped( @@ -167,14 +163,19 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } -STATIC void -_xfs_buf_initialize( - xfs_buf_t *bp, - xfs_buftarg_t *target, +struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, xfs_off_t range_base, size_t range_length, xfs_buf_flags_t flags) { + struct xfs_buf *bp; + + bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + if (unlikely(!bp)) + return NULL; + /* * We don't want certain flags to appear in b_flags. */ @@ -203,8 +204,9 @@ _xfs_buf_initialize( init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(xb_create); - trace_xfs_buf_init(bp, _RET_IP_); + + return bp; } /* @@ -277,7 +279,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); } /* @@ -539,16 +541,14 @@ xfs_buf_get( if (likely(bp)) goto found; - new_bp = xfs_buf_allocate(flags); + new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, + flags); if (unlikely(!new_bp)) return NULL; - _xfs_buf_initialize(new_bp, target, - ioff << BBSHIFT, isize << BBSHIFT, flags); - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); if (!bp) { - xfs_buf_deallocate(new_bp); + kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } @@ -557,7 +557,7 @@ xfs_buf_get( if (error) goto no_buffer; } else - xfs_buf_deallocate(new_bp); + kmem_zone_free(xfs_buf_zone, new_bp); /* * Now we have a workable buffer, fill in the block number so @@ -694,19 +694,6 @@ xfs_buf_read_uncached( return bp; } -xfs_buf_t * -xfs_buf_get_empty( - size_t len, - xfs_buftarg_t *target) -{ - xfs_buf_t *bp; - - bp = xfs_buf_allocate(0); - if (bp) - _xfs_buf_initialize(bp, target, 0, len, 0); - return bp; -} - /* * Return a buffer allocated as an empty buffer and associated to external * memory via xfs_buf_associate_memory() back to it's empty state. 
@@ -792,10 +779,9 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_allocate(0); + bp = xfs_buf_alloc(target, 0, len, 0); if (unlikely(bp == NULL)) goto fail; - _xfs_buf_initialize(bp, target, 0, len, 0); error = _xfs_buf_get_pages(bp, page_count, 0); if (error) @@ -823,7 +809,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index fa38401449d9..26b909417deb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, + xfs_buf_flags_t); extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 493447dc4f55..8c9db4e5ddd2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1047,7 +1047,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; -- cgit v1.2.3-58-ga151 From 901796afca0d31d97bf6d1bf2ab251a93a4b8c83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:49 +0000 Subject: xfs: clean up xfs_ioerror_alert Instead of passing the block number and mount structure explicitly, get them off the bp and make the argument order more natural. Also move it to xfs_buf.c and stop printing the device name, given that we already get the fs name as part of xfs_alert, and we know what device it operates on because of the caller that gets printed. Finally, rename it to xfs_buf_ioerror_alert and pass __func__ as argument where it makes sense.
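To make the calling-convention change concrete, here is a hedged user-space sketch of the idea: derive everything the error report needs from the buffer object itself rather than passing it alongside. struct buf and buf_ioerror_alert() are illustrative stand-ins, not the kernel API:

	#include <stdio.h>

	struct buf {
		unsigned long long	blkno;	/* block address kept on the object */
		int			error;	/* last I/O error recorded on the object */
		size_t			count;	/* I/O size */
	};

	/* Callers pass only the buffer and their own name. */
	void buf_ioerror_alert(const struct buf *bp, const char *func)
	{
		fprintf(stderr,
			"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zu\n",
			bp->blkno, func, bp->error, bp->count);
	}

A typical call site then shrinks to a two-liner of the form "if (error) buf_ioerror_alert(bp, __func__);", mirroring the pattern visible throughout the diff below, and there is no longer a way to report a block number that disagrees with the buffer being reported.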
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 11 +++++++++++ fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_log.c | 14 +++++++------- fs/xfs/xfs_log_recover.c | 25 ++++++++----------------- fs/xfs/xfs_mount.c | 3 +-- fs/xfs/xfs_rw.c | 20 +------------------- fs/xfs/xfs_rw.h | 2 -- fs/xfs/xfs_trans_buf.c | 9 +++------ fs/xfs/xfs_vnodeops.c | 11 +++++------ 9 files changed, 37 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1f24ee5f0d7a..0a767fca0305 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1004,6 +1004,17 @@ xfs_buf_ioerror( trace_xfs_buf_ioerror(bp, error, _RET_IP_); } +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", + (__uint64_t)XFS_BUF_ADDR(bp), func, + bp->b_error, XFS_BUF_COUNT(bp)); +} + int xfs_bwrite( struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 26b909417deb..357a3371cae7 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -205,6 +205,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8c9db4e5ddd2..2758a6277c52 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -880,7 +880,7 @@ xlog_iodone(xfs_buf_t *bp) */ if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { - xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* @@ -1387,9 +1387,9 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync", log->l_mp, bp, - XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); return error; } if (split) { @@ -1423,9 +1423,9 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync (split)", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); return error; } } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8f70f3469997..82ee9db628ed 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -183,8 +183,7 @@ xlog_bread_noalign( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -269,10 +268,8 @@ xlog_bwrite( XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); error = xfs_bwrite(bp); - if (error) { - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); - } + if (error) + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -364,9 +361,7 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! 
*/ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_target->bt_mount, bp, - XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } @@ -2138,8 +2133,7 @@ xlog_recover_buffer_pass2( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, - bp, buf_f->blf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); xfs_buf_relse(bp); return error; } @@ -2234,8 +2228,7 @@ xlog_recover_inode_pass2( } error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); xfs_buf_relse(bp); goto error; } @@ -2542,8 +2535,7 @@ xlog_recover_dquot_pass2( XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); return error; } ASSERT(bp); @@ -3695,8 +3687,7 @@ xlog_do_recover( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e8fe1cb54094..f3b1cec38b81 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1610,8 +1610,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) - xfs_ioerror_alert("xfs_unmountfs_writesb", - mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_ioerror_alert(sbp, __func__); xfs_buf_relse(sbp); } return error; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 86f1928b4cfa..597d044a09a1 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -91,24 +91,6 @@ xfs_do_force_shutdown( } } -/* - * Prints out an ALERT message about I/O error. 
- */ -void -xfs_ioerror_alert( - char *func, - struct xfs_mount *mp, - xfs_buf_t *bp, - xfs_daddr_t blkno) -{ - xfs_alert(mp, - "I/O error occurred: meta-data dev %s block 0x%llx" - " (\"%s\") error %d buf count %zd", - xfs_buf_target_name(bp->b_target), - (__uint64_t)blkno, func, - bp->b_error, XFS_BUF_COUNT(bp)); -} - /* * This isn't an absolute requirement, but it is * just a good idea to call xfs_read_buf instead of @@ -143,7 +125,7 @@ xfs_read_buf( } else { *bpp = NULL; if (error) { - xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } else { error = XFS_ERROR(EIO); } diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 11c41ec6ed75..bbdb9ad6a4ba 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, - xfs_buf_t *bp, xfs_daddr_t blkno); extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 5bab5980a6f9..475a4ded4f41 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -296,8 +296,7 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -339,8 +338,7 @@ xfs_trans_read_buf( xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read @@ -391,8 +389,7 @@ xfs_trans_read_buf( error = bp->b_error; xfs_buf_stale(bp); XFS_BUF_DONE(bp); - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index fc38f15808c6..4ecf2a549060 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -87,8 +87,7 @@ xfs_readlink_bmap( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); goto out; } @@ -1993,8 +1992,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); break; } memset(bp->b_addr + @@ -2006,8 +2005,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); break; } } -- cgit v1.2.3-58-ga151 From b38505b09b7854d446b2f60b4414e3231277aa1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:50 +0000 Subject: xfs: use xfs_ioerror_alert in xfs_buf_iodone_callbacks Use xfs_ioerror_alert instead of opencoding a very similar error message. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 06a76ca99659..65d6f4432d28 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -975,9 +975,7 @@ xfs_buf_iodone_callbacks( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - xfs_buf_target_name(bp->b_target), - (__uint64_t)XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } lasttarg = bp->b_target; -- cgit v1.2.3-58-ga151 From 02b102df1502a7ea4167d115510e1e8fe6467f12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:51 +0000 Subject: xfs: remove xfs_buf_target_name The calling convention that returns a pointer to a static buffer is fairly nasty, so just opencode it in the only caller that is left. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 6 +++++- fs/xfs/xfs_buf.h | 9 --------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0a767fca0305..6f615c259411 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1457,9 +1457,13 @@ xfs_setsize_buftarg_flags( btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, xfs_buf_target_name(btp)); + sectorsize, name); return EINVAL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 357a3371cae7..be19dd2b0212 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -230,15 +230,6 @@ extern void xfs_buf_delwri_promote(struct xfs_buf *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -static inline const char * -xfs_buf_target_name(struct xfs_buftarg *target) -{ - static char __b[BDEVNAME_SIZE]; - - return bdevname(target->bt_bdev, __b); -} - - #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) -- cgit v1.2.3-58-ga151 From a9add83e5abd29bf2b7b3658311199eeabbdefc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:52 +0000 Subject: xfs: remove XFS_bflush Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.h | 2 -- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_super.c | 4 ++-- fs/xfs/xfs_sync.c | 2 +- 6 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index be19dd2b0212..5bab046e859f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -294,6 +294,4 @@ extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) - #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82ee9db628ed..541a508adea1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3654,7 +3654,7 @@ xlog_do_recover( return error; } - XFS_bflush(log->l_mp->m_ddev_targp); + xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened 
during recovery, bail out. diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f3b1cec38b81..d06afbc3540d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1481,7 +1481,7 @@ xfs_unmountfs( * state as much as possible. */ xfs_reclaim_inodes(mp, 0); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ddaf97a57ec6..5cff443f6cdb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1681,7 +1681,7 @@ xfs_qm_quotacheck( * quotacheck'd stamp on the superblock. So, here we do a synchronous * flush. */ - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); /* * If one type of quotas is off, then it will lose its diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 6ad05e68abda..ba16248bcf24 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1015,7 +1015,7 @@ xfs_fs_put_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); xfs_freesb(mp); @@ -1439,7 +1439,7 @@ xfs_fs_fill_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); goto out_free_sb; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index bf2b38c21caa..aa3dc1a4d53d 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -377,7 +377,7 @@ xfs_quiesce_data( /* flush data-only devices */ if (mp->m_rtdev_targp) - XFS_bflush(mp->m_rtdev_targp); + xfs_flush_buftarg(mp->m_rtdev_targp, 1); return error ? error : error2; } -- cgit v1.2.3-58-ga151 From 5a93a064d27b42e4af1772b0599b53e3241191ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:53 +0000 Subject: xfs: do not flush data workqueues in xfs_flush_buftarg When we call xfs_flush_buftarg (generally from sync or umount) it already is too late to flush the data workqueues, as I/O completion is signalled for them and we are thus already done with the data we would flush here. There are places where flushing them might be useful, but the current sync interface doesn't give us that opportunity. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6f615c259411..cf0ac056815f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1623,13 +1623,6 @@ xfs_buf_delwri_promote( spin_unlock(&btp->bt_delwri_lock); } -STATIC void -xfs_buf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); -} - /* * Move as many buffers as specified to the supplied list * idicating if we skipped any buffers to prevent deadlocks. @@ -1752,9 +1745,7 @@ xfs_flush_buftarg( LIST_HEAD(wait_list); struct blk_plug plug; - xfs_buf_runall_queues(xfsconvertd_workqueue); - xfs_buf_runall_queues(xfsdatad_workqueue); - xfs_buf_runall_queues(xfslogd_workqueue); + flush_workqueue(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); pincount = xfs_buf_delwri_split(target, &tmp_list, 0); -- cgit v1.2.3-58-ga151 From 5703728ac1194baee554163180415baf4ccd9996 Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 12 Oct 2011 15:09:34 +0800 Subject: nfs: fix bug about IPv6 address scope checking The result from ipv6_addr_scope() is a set of flags, not a single value, so we can't just compare the result with IPV6_ADDR_SCOPE_LINKLOCAL. 
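To see why the original test misfires, consider this small, hedged stand-alone example before the fix itself is described; the flag values are invented for illustration and are not the kernel's IPV6_ADDR_* definitions:

	#include <stdio.h>

	#define ADDR_UNICAST	0x0001	/* hypothetical classification bit */
	#define ADDR_LINKLOCAL	0x0020	/* hypothetical classification bit */

	int main(void)
	{
		unsigned int type = ADDR_UNICAST | ADDR_LINKLOCAL;

		/* Buggy: exact-match comparison against one bit of a bitmask. */
		if (type == ADDR_LINKLOCAL)
			printf("== matched\n");		/* never reached */

		/* Fixed: test the individual bit, as the patch below does. */
		if (type & ADDR_LINKLOCAL)
			printf("& matched\n");
		return 0;
	}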
This patch fixes the problem, and checks that the addresses are equal before comparing their scope IDs. Signed-off-by: Mi Jinlong Signed-off-by: J. Bruce Fields --- fs/nfs/client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5833fbbf59b0..b4e41dd4d0f6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3-58-ga151 From b238b8fa93353ab50c9a2b1e2fa47a0ab01c37cd Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Wed, 12 Oct 2011 09:17:24 -0700 Subject: pstore: make pstore write function return normal success/fail value Currently the pstore write interface uses the record id as its return value, but that is not enough because it cannot tell the caller whether the write operation succeeded. Pass the record id back via an argument pointer and return zero for success, non-zero for failure. Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 10 ++++++---- drivers/firmware/efivars.c | 17 +++++++++-------- fs/pstore/platform.c | 11 ++++++----- include/linux/pstore.h | 4 ++-- 4 files changed, 23 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 5e820ea35570..127408069ca7 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -933,7 +933,7 @@ static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); -static u64 erst_writer(enum pstore_type_id type, unsigned int part, +static int erst_writer(enum pstore_type_id type, u64 *id, unsigned int part, size_t size, struct pstore_info *psi); static int erst_clearer(enum pstore_type_id type, u64 id, struct pstore_info *psi); @@ -1040,11 +1040,12 @@ out: return (rc < 0) ?
rc : (len - sizeof(*rcd)); } -static u64 erst_writer(enum pstore_type_id type, unsigned int part, +static int erst_writer(enum pstore_type_id type, u64 *id, unsigned int part, size_t size, struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) (erst_info.buf - sizeof(*rcd)); + int ret; memset(rcd, 0, sizeof(*rcd)); memcpy(rcd->hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); @@ -1079,9 +1080,10 @@ static u64 erst_writer(enum pstore_type_id type, unsigned int part, } rcd->sec_hdr.section_severity = CPER_SEV_FATAL; - erst_write(&rcd->hdr); + ret = erst_write(&rcd->hdr); + *id = rcd->hdr.record_id; - return rcd->hdr.record_id; + return ret; } static int erst_clearer(enum pstore_type_id type, u64 id, diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index be8bcb035e2a..8370f72d87ff 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -490,8 +490,8 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, return 0; } -static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, - size_t size, struct pstore_info *psi) +static int efi_pstore_write(enum pstore_type_id type, u64 *id, + unsigned int part, size_t size, struct pstore_info *psi) { char name[DUMP_NAME_LEN]; char stub_name[DUMP_NAME_LEN]; @@ -499,7 +499,7 @@ static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, efi_guid_t vendor = LINUX_EFI_CRASH_GUID; struct efivars *efivars = psi->data; struct efivar_entry *entry, *found = NULL; - int i; + int i, ret = 0; sprintf(stub_name, "dump-type%u-%u-", type, part); sprintf(name, "%s%lu", stub_name, get_seconds()); @@ -548,18 +548,19 @@ static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, efivar_unregister(found); if (size) - efivar_create_sysfs_entry(efivars, + ret = efivar_create_sysfs_entry(efivars, utf16_strsize(efi_name, DUMP_NAME_LEN * 2), efi_name, &vendor); - return part; + *id = part; + return ret; }; static int efi_pstore_erase(enum pstore_type_id type, u64 id, struct pstore_info *psi) { - efi_pstore_write(type, id, 0, psi); + efi_pstore_write(type, &id, (unsigned int)id, 0, psi); return 0; } @@ -580,8 +581,8 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, return -1; } -static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, - size_t size, struct pstore_info *psi) +static int efi_pstore_write(enum pstore_type_id type, u64 *id, + unsigned int part, size_t size, struct pstore_info *psi) { return 0; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0472924024cc..2bd620f0d796 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -87,7 +87,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize; + int hsize, ret; unsigned int part = 1; unsigned long flags = 0; int is_locked = 0; @@ -122,9 +122,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, part, + ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part, hsize + l1_cpy + l2_cpy, psinfo); - if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; @@ -247,6 +247,7 @@ static void pstore_timefunc(unsigned long dummy) int pstore_write(enum pstore_type_id type, char *buf, size_t size) { u64 id; + int ret; unsigned long flags; if (!psinfo) @@ -257,8 
+258,8 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) spin_lock_irqsave(&psinfo->buf_lock, flags); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, 0, size, psinfo); - if (pstore_is_mounted()) + ret = psinfo->write(type, &id, 0, size, psinfo); + if (ret == 0 && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, size, CURRENT_TIME, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index b91440e64d6e..ea567321ae3c 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,8 +39,8 @@ struct pstore_info { int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); - u64 (*write)(enum pstore_type_id type, unsigned int part, - size_t size, struct pstore_info *psi); + int (*write)(enum pstore_type_id type, u64 *id, + unsigned int part, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, struct pstore_info *psi); void *data; -- cgit v1.2.3-58-ga151 From ac423446d83e4fd5c3ffba393b887e531cf6bf46 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: switch CIFSSMBQAllEAs to use memcmp ...as that's more efficient when we know that the lengths are equal. Reported-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a80f7bd97b90..b7df4bff3aaa 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5840,7 +5840,7 @@ QAllEAsRetry: if (ea_name) { if (ea_name_len == name_len && - strncmp(ea_name, temp_ptr, name_len) == 0) { + memcmp(ea_name, temp_ptr, name_len) == 0) { temp_ptr += name_len + 1; rc = value_len; if (buf_size == 0) -- cgit v1.2.3-58-ga151 From b4dacbc2823799b84d4d4563f865b0680cb44ed1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: use memcpy for magic string in cifs signature generation BSRSPYL ...it's more efficient since we know the length. 
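A minimal userspace sketch of why the substitution is safe here: "BSRSPYL" is seven characters plus its terminating NUL, exactly the eight bytes copied, so both calls produce identical output and memcpy merely skips the byte-by-byte terminator scan.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char a[8], b[8];

            strncpy(a, "BSRSPYL", 8);  /* scans for the NUL terminator as it copies */
            memcpy(b, "BSRSPYL", 8);   /* copies a known 8 bytes, no scanning */

            printf("identical: %d\n", memcmp(a, b, 8) == 0);  /* prints 1 */
            return 0;
    }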
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 30acd22147e1..12f1c1263013 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -92,7 +92,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, return rc; if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; } @@ -189,7 +189,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; } -- cgit v1.2.3-58-ga151 From 4a29a0bd1d63c1a434f29ce5dd6428b65604a398 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: get rid of unused xid in cifs_get_root Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 03d2f6b27cc4..7986d330cbb5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -532,7 +532,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *full_path = NULL; char *s, *p; char sep; - int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); @@ -541,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) cFYI(1, "Get root dentry for %s", full_path); - xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); p = s = full_path; @@ -572,7 +570,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dentry); dentry = child; } while (!IS_ERR(dentry)); - _FreeXid(xid); kfree(full_path); return dentry; } -- cgit v1.2.3-58-ga151 From f3a6a60e4c3ac83370c620dbbd08d2a418b9364d Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 12 Oct 2011 14:14:04 +0200 Subject: cifs: Fix typo 'CIFS_NFSD_EXPORT' It should be 'CONFIG_CIFS_NFSD_EXPORT'. No-one noticed because that symbol depends on BROKEN. 
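The failure mode generalizes: an #ifdef on a macro that nothing ever defines silently compiles the guarded block out, with no diagnostic. A minimal sketch with made-up names:

    #include <stdio.h>

    #define CONFIG_FOO 1   /* what the build system would define */

    int main(void)
    {
    #ifdef FOO             /* misspelled guard: block is silently dropped */
            printf("never printed\n");
    #endif
    #ifdef CONFIG_FOO      /* correct guard */
            printf("printed\n");
    #endif
            return 0;
    }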
Signed-off-by: Paul Bolle Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++-- fs/cifs/cifsfs.h | 4 ++-- fs/cifs/export.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7986d330cbb5..aa8fe7cee0b4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -132,12 +132,12 @@ cifs_read_super(struct super_block *sb) else sb->s_d_op = &cifs_dentry_ops; -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); sb->s_export_op = &cifs_export_ops; } -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ return 0; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 95da8027983d..d9dbaf869cd1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ #define CIFS_VERSION "1.75" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 55d87ac52000..9c7ecdccf2f3 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -45,7 +45,7 @@ #include "cifs_debug.h" #include "cifsfs.h" -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ @@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { .encode_fs = */ }; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ -- cgit v1.2.3-58-ga151 From c974befa402b5eb2ed115b3083b5a46a4be85a9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: untangle server->maxBuf and CIFSMaxBufSize server->maxBuf is the maximum SMB size (including header) that the server can handle. CIFSMaxBufSize is the maximum amount of data (sans header) that the client can handle. Currently maxBuf is being capped at CIFSMaxBufSize + the max headers size, and the two values are used somewhat interchangeably in the code. This makes little sense as these two values are not related at all. Separate them and make sure the code uses the right values in the right places. 
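A sketch of the distinction, with hypothetical numbers chosen only to show that the two limits need not agree:

    #include <stdio.h>

    static unsigned int server_maxBuf  = 16644;  /* server limit, header included */
    static unsigned int CIFSMaxBufSize = 16384;  /* client limit, data only */

    int main(void)
    {
            /* Request data sizing should come from the client's own data
             * limit, not be derived from the server's header-inclusive one. */
            printf("MaxDataCount = %u\n", CIFSMaxBufSize & 0xFFFFFF00);

            /* The server limit still matters, but for a different check:
             * the total frame sent must not exceed it. */
            printf("frame limit  = %u\n", server_maxBuf);
            return 0;
    }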
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 32 +++++++++----------------------- fs/cifs/connect.c | 11 +++++------ fs/cifs/file.c | 2 +- fs/cifs/sess.c | 4 +++- 4 files changed, 18 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b7df4bff3aaa..602326fa4a4f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -453,8 +453,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) } server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); server->maxReq = le16_to_cpu(rsp->MaxMpxCount); - server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), - (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ @@ -561,8 +560,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) little endian */ server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); /* probably no need to store and check maxvcs */ - server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), - (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); @@ -2812,8 +2810,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; @@ -3306,8 +3303,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count, pSMB->Reserved = 0; pSMB->TotalParameterCount = cpu_to_le32(parm_len); pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->DataCount = pSMB->TotalDataCount; temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + @@ -3977,8 +3973,7 @@ findFirstRetry: params = 12 + name_len /* includes null */ ; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(10); - pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4052,8 +4047,7 @@ findFirstRetry: psrch_inf->index_of_last_entry = 2 /* skip . and .. 
*/ + psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -4097,9 +4091,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, byte_count = 0; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(8); - pSMB->MaxDataCount = - cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & - 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4181,8 +4173,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -6035,12 +6026,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, pSMB->TotalParameterCount = 0 ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = 0; /* same in little endian or be */ -/* BB VERIFY verify which is correct for above BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 71beb0201970..a0077a5e0669 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -181,7 +181,7 @@ cifs_reconnect(struct TCP_Server_Info *server) -EINVAL = invalid transact2 */ -static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +static int check2ndT2(struct smb_hdr *pSMB) { struct smb_t2_rsp *pSMBt; int remaining; @@ -214,9 +214,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) cFYI(1, "missing %d bytes from transact2, check next response", remaining); - if (total_data_size > maxBufSize) { + if (total_data_size > CIFSMaxBufSize) { cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, maxBufSize); + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -486,7 +486,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->command != buf->Command) continue; - if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { + if (*length == 0 && check2ndT2(buf) > 0) { /* We have a multipart transact2 resp */ *is_multi_rsp = true; if (mid->resp_buf) { @@ -3130,8 +3130,7 @@ try_mount_again: cFYI(DBG2, "no very large read support, rsize now 127K"); } if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, - (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + cifs_sb->rsize = min(cifs_sb->rsize, CIFSMaxBufSize); cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9f41a10523a1..fd57165f55fa 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1868,7 +1868,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { current_read_size = min_t(const int, current_read_size, - pTcon->ses->server->maxBuf - 128); + CIFSMaxBufSize); } rc = -EAGAIN; while (rc == -EAGAIN) { diff 
--git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e619692ee0..c7d80e24f24e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) /* that we use in next few lines */ /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, + USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); pSMB->req.VcNumber = get_next_vcnum(ses); -- cgit v1.2.3-58-ga151 From 376b43f41c8b9315f7efdf085d214b6024337381 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: clean up checkSMB The variable names in this function are so ambiguous that it's very difficult to know what it's doing. Rename them to make it a bit more clear. Also, remove a redundant length check. cifsd checks to make sure that the rfclen isn't larger than the maximum frame size when it does the receive. Finally, change checkSMB to return a real error code (-EIO) when it finds an error. That will help simplify some coming changes in the callers. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/misc.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7c1693392598..4a1801b3195f 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) } int -checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) { - __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", + total_read, rfclen); - if (length < 2 + sizeof(struct smb_hdr)) { - if ((length >= sizeof(struct smb_hdr) - 1) + /* is this frame too small to even get to a BCC? 
*/ + if (total_read < 2 + sizeof(struct smb_hdr)) { + if ((total_read >= sizeof(struct smb_hdr) - 1) && (smb->Status.CifsError != 0)) { + /* it's an error return */ smb->WordCount = 0; /* some error cases do not return wct and bcc */ return 0; - } else if ((length == sizeof(struct smb_hdr) + 1) && + } else if ((total_read == sizeof(struct smb_hdr) + 1) && (smb->WordCount == 0)) { char *tmp = (char *)smb; /* Need to work around a bug in two servers here */ @@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) } else { cERROR(1, "Length less than smb header size"); } - return 1; - } - if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "smb length greater than MaxBufSize, mid=%d", - smb->Mid); - return 1; + return -EIO; } + /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb, mid)) - return 1; + return -EIO; clc_len = smbCalcSize(smb); - if (4 + len != length) { + if (4 + rfclen != total_read) { cERROR(1, "Length read does not match RFC1001 length %d", - len); - return 1; + rfclen); + return -EIO; } - if (4 + len != clc_len) { + if (4 + rfclen != clc_len) { /* check if bcc wrapped around for large read responses */ - if ((len > 64 * 1024) && (len > clc_len)) { + if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ - if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + len, smb->Mid); + clc_len, 4 + rfclen, smb->Mid); - if (4 + len < clc_len) { + if (4 + rfclen < clc_len) { cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - len, smb->Mid); - return 1; - } else if (len > clc_len + 512) { + rfclen, smb->Mid); + return -EIO; + } else if (rfclen > clc_len + 512) { /* * Some servers (Windows XP in particular) send more * data than the lengths in the SMB packet would @@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) * data to 512 bytes. */ cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", len, smb->Mid); - return 1; + "than SMB for mid=%u", rfclen, smb->Mid); + return -EIO; } } return 0; -- cgit v1.2.3-58-ga151 From 826a95e4a33f3e9bfa0d31ab769d5b01130f7111 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: consolidate signature generating code We have two versions of signature generating code. A vectorized and non-vectorized version. Eliminate a large chunk of cut-and-paste code by turning the non-vectorized version into a wrapper around the vectorized one. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 103 ++++++++++---------------------------------------- fs/cifs/cifsproto.h | 2 +- fs/cifs/transport.c | 11 ++++-- 3 files changed, 30 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 12f1c1263013..2cfb695d1f89 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,83 +37,8 @@ * the sequence number before this function is called. Also, this function * should be called with the server->srv_mutex held. 
*/ -static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, char *signature) -{ - int rc; - - if (cifs_pdu == NULL || signature == NULL || server == NULL) - return -EINVAL; - - if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature\n", __func__); - return -1; - } - - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); - if (rc) { - cERROR(1, "%s: Could not init md5\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - server->session_key.response, server->session_key.len); - if (rc) { - cERROR(1, "%s: Could not update with response\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); - if (rc) { - cERROR(1, "%s: Could not update with payload\n", __func__); - return rc; - } - - rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - if (rc) - cERROR(1, "%s: Could not generate md5 hash\n", __func__); - - return rc; -} - -/* must be called with server->srv_mutex held */ -int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, - __u32 *pexpected_response_sequence_number) -{ - int rc = 0; - char smb_signature[20]; - - if ((cifs_pdu == NULL) || (server == NULL)) - return -EINVAL; - - if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || - server->tcpStatus == CifsNeedNegotiate) - return rc; - - if (!server->session_estab) { - memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); - return rc; - } - - cifs_pdu->Signature.Sequence.SequenceNumber = - cpu_to_le32(server->sequence_number); - cifs_pdu->Signature.Sequence.Reserved = 0; - - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; - - rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); - if (rc) - memset(cifs_pdu->Signature.SecuritySignature, 0, 8); - else - memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); - - return rc; -} - -static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - struct TCP_Server_Info *server, char *signature) +static int cifs_calc_signature(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) { int i; int rc; @@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, { int rc = 0; char smb_signature[20]; - struct smb_hdr *cifs_pdu = iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; } -int cifs_verify_signature(struct smb_hdr *cifs_pdu, +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + struct kvec iov; + + iov.iov_base = cifs_pdu; + iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + + return cifs_sign_smb2(&iov, 1, server, + pexpected_response_sequence_number); +} + +int cifs_verify_signature(struct 
kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cifs_pdu->Signature.Sequence.Reserved = 0; mutex_lock(&server->srv_mutex); - rc = cifs_calculate_signature(cifs_pdu, server, - what_we_think_sig_should_be); + rc = cifs_calc_signature(iov, nr_iov, server, + what_we_think_sig_should_be); mutex_unlock(&server->srv_mutex); if (rc) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8df28e925e5b..03dc945d94a3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -380,7 +380,7 @@ extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); -extern int cifs_verify_signature(struct smb_hdr *, +extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 10ca6b2c26b7..33a3fbf3a3a5 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -496,13 +496,18 @@ int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - dump_smb(mid->resp_buf, - min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + struct kvec iov; + + iov.iov_base = mid->resp_buf; + iov.iov_len = len; /* FIXME: add code to kill session */ - if (cifs_verify_signature(mid->resp_buf, server, + if (cifs_verify_signature(&iov, 1, server, mid->sequence_number + 1) != 0) cERROR(1, "Unexpected SMB signature"); } -- cgit v1.2.3-58-ga151 From e2218eab2050e879b253ca112aabd5f7167572af Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: trivial: remove obsolete comment Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a0077a5e0669..6126fbeaecb6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -694,12 +694,6 @@ incomplete_rcv: * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ - - /* - * Note that RFC 1001 length is big endian on the wire, - * but we convert it here so it is always manipulated - * as host byte order. - */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); cFYI(1, "rfc1002 length 0x%x", pdu_length+4); -- cgit v1.2.3-58-ga151 From e831e6cf3acb058d898411367a582deef80e32f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: make smb_msg local to read_from_socket If msg_controllen is 0, then the socket layer should never touch these fields. Thus, there's no need to continually reset them. Also, there's no need to keep this field on the stack for the demultiplex thread, just make it a local variable in read_from_socket. 
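The same pattern holds for the userspace analogue of this interface: when no ancillary data is wanted, zeroing the control fields once before the loop suffices, because a receive with msg_controllen == 0 leaves them untouched. A minimal sketch, error handling trimmed:

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    /* Read exactly to_read bytes from a connected socket. */
    static ssize_t read_exact(int fd, char *buf, size_t to_read)
    {
            struct msghdr msg;
            struct iovec iov;
            size_t total = 0;
            ssize_t n;

            memset(&msg, 0, sizeof(msg));  /* msg_control = NULL, msg_controllen = 0 */

            while (total < to_read) {
                    iov.iov_base   = buf + total;
                    iov.iov_len    = to_read - total;
                    msg.msg_iov    = &iov;
                    msg.msg_iovlen = 1;

                    n = recvmsg(fd, &msg, 0);
                    if (n <= 0)
                            return n;  /* error or EOF */
                    total += n;
            }
            return total;
    }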
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6126fbeaecb6..ed969fd5f7cc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -359,16 +359,20 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, } static int -read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, +read_from_socket(struct TCP_Server_Info *server, struct kvec *iov, unsigned int to_read, unsigned int *ptotal_read, bool is_header_read) { int length, rc = 0; unsigned int total_read; + struct msghdr smb_msg; char *buf = iov->iov_base; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + for (total_read = 0; total_read < to_read; total_read += length) { - length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, + length = kernel_recvmsg(server->ssocket, &smb_msg, iov, 1, to_read - total_read, 0); if (server->tcpStatus == CifsExiting) { /* then will exit */ @@ -397,8 +401,6 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, iov->iov_base = (to_read - total_read) + buf; iov->iov_len = to_read - total_read; - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; rc = 3; } else rc = 1; @@ -634,7 +636,6 @@ cifs_demultiplex_thread(void *p) unsigned int pdu_length, total_read; char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; struct smb_hdr *smb_buffer = NULL; - struct msghdr smb_msg; struct kvec iov; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; @@ -665,8 +666,6 @@ cifs_demultiplex_thread(void *p) buf = smallbuf; iov.iov_base = buf; iov.iov_len = 4; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; pdu_length = 4; /* enough to get RFC1001 header */ incomplete_rcv: @@ -681,7 +680,7 @@ incomplete_rcv: continue; } - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, + rc = read_from_socket(server, &iov, pdu_length, &total_read, true /* header read */); if (rc == 3) goto incomplete_rcv; @@ -710,7 +709,7 @@ incomplete_rcv: iov.iov_base = 4 + buf; iov.iov_len = pdu_length; - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, + rc = read_from_socket(server, &iov, pdu_length, &total_read, false); if (rc == 2) break; -- cgit v1.2.3-58-ga151 From ba749e6d5227de22e442c6088cc7dc1f0c5c68bf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: check for unresponsive server every time we call kernel_recvmsg If the server stops sending data while in the middle of sending a response, then we still want to reconnect it if it doesn't come back. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 33 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ed969fd5f7cc..abbc6c3fe3f1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -358,6 +358,23 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, return true; } +static bool +server_unresponsive(struct TCP_Server_Info *server) +{ + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " "Reconnecting...", server->hostname, (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + wake_up(&server->response_q); + return true; + } + + return false; +} + static int read_from_socket(struct TCP_Server_Info *server, struct kvec *iov, unsigned int to_read, @@ -372,6 +389,11 @@ read_from_socket(struct TCP_Server_Info *server, smb_msg.msg_controllen = 0; for (total_read = 0; total_read < to_read; total_read += length) { + if (server_unresponsive(server)) { + rc = 1; + break; + } + length = kernel_recvmsg(server->ssocket, &smb_msg, iov, 1, to_read - total_read, 0); if (server->tcpStatus == CifsExiting) { /* then will exit */ @@ -669,17 +691,6 @@ cifs_demultiplex_thread(void *p) pdu_length = 4; /* enough to get RFC1001 header */ incomplete_rcv: - if (echo_retries > 0 && server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + - (echo_retries * SMB_ECHO_INTERVAL))) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (echo_retries * SMB_ECHO_INTERVAL / HZ)); - cifs_reconnect(server); - wake_up(&server->response_q); - continue; - } - rc = read_from_socket(server, &iov, pdu_length, &total_read, true /* header read */); if (rc == 3) -- cgit v1.2.3-58-ga151 From e75047344ea415760b2508a6fa29c0288c7b6b68 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 12 Oct 2011 17:47:03 -0500 Subject: add new module parameter 'enable_oplocks' Thus spake Jeff Layton: "Making that a module parm would allow you to set that parameter at boot time without needing to add special startup scripts. IMO, all of the procfile "switches" under /proc/fs/cifs should be module parms instead." This patch doesn't alter the default behavior (Oplocks are enabled by default). To disable oplocks when loading the module, use modprobe cifs enable_oplocks=0 (any of '0' or 'n' or 'N' conventions can be used). To disable oplocks at runtime using the new interface, use echo 0 > /sys/module/cifs/parameters/enable_oplocks The older /proc/fs/cifs/OplockEnabled interface will be deprecated after two releases. A subsequent patch will add a warning message about this deprecation. Changes since v2: - make enable_oplocks a 'bool' Changes since v1: - eliminate the use of an extra variable by renaming the old one to enable_oplocks and making it an 'int' type.
Reported-by: Alexander Swen Reviewed-by: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 6 +++--- fs/cifs/cifsfs.c | 6 +++++- fs/cifs/cifsglob.h | 3 ++- fs/cifs/dir.c | 2 +- fs/cifs/file.c | 4 ++-- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6d40656e1e29..e1715313f398 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = { static int cifs_oplock_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%d\n", oplockEnabled); + seq_printf(m, "%d\n", enable_oplocks); return 0; } @@ -530,9 +530,9 @@ static ssize_t cifs_oplock_proc_write(struct file *file, if (rc) return rc; if (c == '0' || c == 'n' || c == 'N') - oplockEnabled = 0; + enable_oplocks = false; else if (c == '1' || c == 'y' || c == 'Y') - oplockEnabled = 1; + enable_oplocks = true; return count; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index aa8fe7cee0b4..5c2972106816 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -53,7 +53,7 @@ int cifsFYI = 0; int cifsERROR = 1; int traceSMB = 0; -unsigned int oplockEnabled = 1; +bool enable_oplocks = true; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644); MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " "reconnecting server. Default: 5. 0 means " "never reconnect."); +module_param(enable_oplocks, bool, 0644); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" + "y/Y/1"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 95dad9d14cf1..d734dee9d495 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -964,7 +964,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions to be established on existing mount if we have the uid/password or Kerberos credential or equivalent for current user */ -GLOBAL_EXTERN unsigned int oplockEnabled; +/* enable or disable oplocks */ +GLOBAL_EXTERN bool enable_oplocks; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 72d448bf96ce..4dd5333753fa 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } tcon = tlink_tcon(tlink); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; if (nd) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fd57165f55fa..8e184150cfb5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -371,7 +371,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -495,7 +495,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, pCifsFile->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; -- cgit v1.2.3-58-ga151 From c9c4708fdf63d26e3fdfc65bba9e82e654cae285 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 12 Oct 2011 11:52:01 +0530 Subject: 
cifs: update README about the kernel module parameters Reported-by: Alexander Swen Cc: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/README | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/cifs/README b/fs/cifs/README index c5c2c5e5f0f2..95b37a1d58f2 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -745,4 +745,18 @@ installed and something like the following lines should be added to the create cifs.spnego * * /usr/local/sbin/cifs.upcall %k create dns_resolver * * /usr/local/sbin/cifs.upcall %k +CIFS kernel module parameters +============================= +These module parameters can be specified or modified either during the time of +module loading or during the runtime by using the interface + /proc/module/cifs/parameters/ + +i.e. echo "value" > /proc/module/cifs/parameters/ + +1. echo_retries - The number of echo attempts before giving up and + reconnecting to the server. The default is 5. The value 0 + means never reconnect. + +2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. + [Y/y/1]. To disable use any of [N/n/0]. -- cgit v1.2.3-58-ga151 From 8bc4392a1e50f346e97f8777aaefd9cfc3d45c9f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 12 Oct 2011 11:51:53 +0530 Subject: cifs: warn about deprecation of /proc/fs/cifs/OplockEnabled interface The plan is to deprecate this interface by kernel version 3.4. Changes since v1 - add a '\n' to the printk. Reported-by: Alexander Swen Cc: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e1715313f398..84e8c0724704 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -526,6 +526,9 @@ static ssize_t cifs_oplock_proc_write(struct file *file, char c; int rc; + printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " + "will be removed in kernel version 3.4. Please migrate to " + "using the 'enable_oplocks' module parameter in cifs.ko.\n"); rc = get_user(c, buffer); if (rc) return rc; -- cgit v1.2.3-58-ga151 From 3d3ea8e64efbeb3e4289675dbbfab82333395642 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Mon, 26 Sep 2011 09:56:44 -0500 Subject: cifs: Add mount options for backup intent (try #6) Add mount options backupuid and backupgid. They allow an authenticated user, who may not have access permission but who has the "Backup files and directories" user right (by virtue of being part of the built-in group Backup Operators), to access files with the intent to back them up, including their ACLs. When the mount option backupuid is specified, the cifs client restricts the use of backup intent to the user whose effective user id matches the one specified with the mount option. When the mount option backupgid is specified, the cifs client restricts the use of backup intent to users who belong to the group id specified with the mount option. If an authenticated user is not part of the built-in group Backup Operators at the server, access to such files is denied, even if allowed by the client.
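A hypothetical usage example (server, share, and ids are made up):

    # let uid 1001, or any member of gid 501, open files with backup intent
    mount -t cifs //server/share /mnt/backup -o username=backupop,backupuid=1001,backupgid=501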
Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_fs_sb.h | 4 ++++ fs/cifs/cifsacl.c | 18 ++++++++++++------ fs/cifs/cifsglob.h | 7 ++++++- fs/cifs/cifsproto.h | 1 + fs/cifs/connect.c | 27 +++++++++++++++++++++++++++ fs/cifs/dir.c | 10 ++++++++-- fs/cifs/file.c | 12 ++++++++++-- fs/cifs/link.c | 17 ++++++++++++----- fs/cifs/misc.c | 15 +++++++++++++++ 9 files changed, 95 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7260e11e21f8..500d65859279 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -43,6 +43,8 @@ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ +#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -55,6 +57,8 @@ struct cifs_sb_info { atomic_t active; uid_t mnt_uid; gid_t mnt_gid; + uid_t mnt_backupuid; + gid_t mnt_backupgid; mode_t mnt_file_mode; mode_t mnt_dir_mode; unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d0f59faefb78..b244e07c3048 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -945,7 +945,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, { struct cifs_ntsd *pntsd = NULL; int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -956,9 +956,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); CIFSSMBClose(xid, tcon, fid); @@ -995,7 +998,7 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, struct cifs_ntsd *pnntsd, u32 acllen) { int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1006,7 +1009,10 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, create_options, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d734dee9d495..9551437a2498 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -167,6 +167,8 @@ struct smb_vol { uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; + uid_t backupuid; + gid_t backupgid; mode_t file_mode; mode_t dir_mode; unsigned secFlg; @@ -179,6 +181,8 @@ struct smb_vol { bool noperm:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool 
cifs_acl:1; + bool backupuid_specified; /* mount option backupuid is specified */ + bool backupgid_specified; /* mount option backupgid is specified */ bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; @@ -219,7 +223,8 @@ struct smb_vol { CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ - CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ + CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ MS_NODEV | MS_SYNCHRONOUS) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 03dc945d94a3..9ddb1eccde69 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -90,6 +90,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); +extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index abbc6c3fe3f1..70dd2c418276 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -831,6 +831,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, { char *value, *data, *end; char *mountdata_copy = NULL, *options; + int err; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -887,6 +888,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, cFYI(1, "Null separator not allowed"); } } + vol->backupuid_specified = false; /* no backup intent for a user */ + vol->backupgid_specified = false; /* no backup intent for a group */ while ((data = strsep(&options, separator)) != NULL) { if (!*data) @@ -1446,6 +1449,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->mfsymlinks = true; } else if (strnicmp(data, "multiuser", 8) == 0) { vol->multiuser = true; + } else if (!strnicmp(data, "backupuid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupuid); + if (err < 0) { + cERROR(1, "%s: Invalid backupuid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupuid_specified = true; + } else if (!strnicmp(data, "backupgid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupgid); + if (err < 0) { + cERROR(1, "%s: Invalid backupgid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupgid_specified = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -2737,6 +2756,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_uid = pvolume_info->linux_uid; cifs_sb->mnt_gid = pvolume_info->linux_gid; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; cFYI(1, "file mode: 0x%x dir mode: 0x%x", @@ -2767,6 +2790,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if 
(pvolume_info->backupuid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; if (pvolume_info->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (pvolume_info->override_gid) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4dd5333753fa..0c8098d54d2b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, @@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, { int rc = -EPERM; int xid; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *pTcon; @@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + GENERIC_WRITE, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8e184150cfb5..237192ae7587 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -174,6 +174,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, int rc; int desiredAccess; int disposition; + int create_options = CREATE_NOT_DIR; FILE_ALL_INFO *buf; desiredAccess = cifs_convert_flags(f_flags); @@ -210,9 +211,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (!buf) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + desiredAccess, create_options, pnetfid, poplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else @@ -465,6 +469,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) char *full_path = NULL; int desiredAccess; int disposition = FILE_OPEN; + int create_options = CREATE_NOT_DIR; __u16 netfid; xid = GetXid(); @@ -524,6 +529,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + /* Can not refresh inode by passing in file_info buf to be returned by SMBOpen and then calling get_inode_info with returned buf since file might have write behind data that needs to be flushed @@ -531,7 +539,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) that inode was not dirty locally we could do this */ rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index db3f18cdf024..8693b5d0e180 100644 
--- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) static int CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; + int remap; + int create_options = CREATE_NOT_DIR; __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; struct cifs_io_parms io_parms; + struct nls_table *nls_codepage; + + nls_codepage = cifs_sb->local_nls; + remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) @@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, return rc; } + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, nls_codepage, remap); if (rc != 0) { kfree(buf); @@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4a1801b3195f..703ef5c6fdb1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -675,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->clientCanCacheRead = false; } } + +bool +backup_cred(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + if (cifs_sb->mnt_backupuid == current_fsuid()) + return true; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (in_group_p(cifs_sb->mnt_backupgid)) + return true; + } + + return false; +} -- cgit v1.2.3-58-ga151 From d02616869221517582dea7972c9b5063ac9a2ae7 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Wed, 21 Sep 2011 14:33:31 -0500 Subject: cifs: clean up unused encryption code Remove unused #if 0 encryption code.
Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smbencrypt.c | 121 --------------------------------------------------- 1 file changed, 121 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 42b9fff48751..ac1221d969d6 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -265,91 +265,6 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16) return rc; } -#if 0 /* currently unused */ -/* Does both the NT and LM owfs of a user's password */ -static void -nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) -{ - char passwd[514]; - - memset(passwd, '\0', 514); - if (strlen(pwd) < 513) - strcpy(passwd, pwd); - else - memcpy(passwd, pwd, 512); - /* Calculate the MD4 hash (NT compatible) of the password */ - memset(nt_p16, '\0', 16); - E_md4hash(passwd, nt_p16); - - /* Mangle the passwords into Lanman format */ - passwd[14] = '\0'; -/* strupper(passwd); */ - - /* Calculate the SMB (lanman) hash functions of the password */ - - memset(p16, '\0', 16); - E_P16((unsigned char *) passwd, (unsigned char *) p16); - - /* clear out local copy of user's password (just being paranoid). */ - memset(passwd, '\0', sizeof(passwd)); -} -#endif - -/* Does the NTLMv2 owfs of a user's password */ -#if 0 /* function not needed yet - but will be soon */ -static void -ntv2_owf_gen(const unsigned char owf[16], const char *user_n, - const char *domain_n, unsigned char kr_buf[16], - const struct nls_table *nls_codepage) -{ - wchar_t *user_u; - wchar_t *dom_u; - int user_l, domain_l; - struct HMACMD5Context ctx; - - /* might as well do one alloc to hold both (user_u and dom_u) */ - user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); - if (user_u == NULL) - return; - dom_u = user_u + 1024; - - /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); - push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ - - /* BB user and domain may need to be uppercased */ - user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); - domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); - - user_l++; /* trailing null */ - domain_l++; - - hmac_md5_init_limK_to_64(owf, 16, &ctx); - hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); - hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); - hmac_md5_final(kr_buf, &ctx); - - kfree(user_u); -} -#endif - -/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ -#if 0 /* currently unused */ -static void -NTLMSSPOWFencrypt(unsigned char passwd[8], - unsigned char *ntlmchalresp, unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - memcpy(p21, passwd, 8); - memset(p21 + 8, 0xbd, 8); - - E_P24(p21, ntlmchalresp, p24); -} -#endif - /* Does the NT MD4 hash then des encryption. */ int SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) @@ -369,39 +284,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) rc = E_P24(p21, c8, p24); return rc; } - - -/* Does the md5 encryption from the NT hash for NTLMv2. 
*/ -/* These routines will be needed later */ -#if 0 -static void -SMBOWFencrypt_ntv2(const unsigned char kr[16], - const struct data_blob *srv_chal, - const struct data_blob *cli_chal, unsigned char resp_buf[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); - hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); - hmac_md5_final(resp_buf, &ctx); -} - -static void -SMBsesskeygen_ntv2(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(nt_resp, 16, &ctx); - hmac_md5_final((unsigned char *) sess_key, &ctx); -} - -static void -SMBsesskeygen_ntv1(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); -} -#endif -- cgit v1.2.3-58-ga151 From 20c3a200c418ea1b02037800830ba8a7cdd1b275 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 12 Oct 2011 20:17:55 -0500 Subject: Typo in cifs readme in name of module parm directory Suresh had a typo in his recent patch adding information on the new oplock_enabled parm. Should be documented as in directory /sys/module/cifs/parameters not /proc/module/cifs/parameters Signed-off-by: Steve French --- fs/cifs/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/README b/fs/cifs/README index 95b37a1d58f2..895da1dc1550 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -751,7 +751,7 @@ These module parameters can be specified or modified either during the time of module loading or during the runtime by using the interface /proc/module/cifs/parameters/ -i.e. echo "value" > /proc/module/cifs/parameters/ +i.e. echo "value" > /sys/module/cifs/parameters/ 1. echo_retries - The number of echo attempts before giving up and reconnecting to the server. The default is 5. The value 0 -- cgit v1.2.3-58-ga151 From 21fed0d5b763b94a7d1568c27d0cce892ab8d43e Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Tue, 9 Aug 2011 14:30:48 -0500 Subject: cifs: Add data structures and functions for uid/gid to SID mapping (try #4) Add data structures and functions necessary to map a uid and gid to SID. These functions are very similar to the ones used to map a SID to uid and gid. This time, instead of storing sid to id mapping sorted on a sid value, id to sid is stored, sorted on an id. A cifs upcall sends an id (uid or gid) and expects a SID structure in return, if mapping was done successfully. A failed id to sid mapping returns EINVAL. This patchset aims to enable chown and chgrp commands when cifsacl mount option is specified, especially to Windows SMB servers. Currently we can't do that. So now along with the chmod command, chown and chgrp work. Winbind is used to map an id to a SID. chown and chgrp use an upcall to provide an id to winbind, and the upcall returns with the corresponding SID if any exists. That SID is used to build the security descriptor. The DACL part of a security descriptor is not changed by either chown or chgrp functionality. The cifs client maintains separate caches for uid to SID and gid to SID mapping. This is similar to the one used earlier to map SID to id (as part of the ID mapping code). I tested it by mounting shares from a Windows (2003) server by authenticating as two users, one at a time, as Administrator and as an ordinary user. And then attempting to change owner of a file on the share.
Depending on the permissions/privileges at the server for that file, the chown request fails either to open the file (to change the ownership) or to set the security descriptor. So it all depends on the privileges on the file at the server and what user you are authenticated as at the server; the cifs client is just a conduit. I compared the security descriptor sent during the chown command with what smbcacls sends when it is used with the -M OWNER: option, and they are similar. Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/cifsglob.h | 6 ++ 2 files changed, 204 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index b244e07c3048..992f1fb299d1 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); spin_unlock(&sidgidlock); + root = &siduidtree; + spin_lock(&uidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&gidsidlock); + return nr_rem; } +static void +sid_rb_insert(struct rb_root *root, unsigned long cid, + struct cifs_sid_id **psidid, char *typestr) +{ + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + if (cid > lsidid->id) { + linkto = &(node->rb_left); + node = node->rb_left; + } + if (cid < lsidid->id) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + (*psidid)->id = cid; + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sprintf(strptr, "%ld", cid); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +sid_rb_search(struct rb_root *root, unsigned long cid) +{ + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + if (cid > lsidid->id) + node = node->rb_left; + else if (cid < lsidid->id) + node = node->rb_right; + else /* node found */ + return lsidid; + } + + return NULL; +} + static struct shrinker cifs_shrinker = { .shrink = cifs_idmap_shrinker, .seeks = DEFAULT_SEEKS, @@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key,
const void *data, size_t datalen) memcpy(payload, data, datalen); key->payload.data = payload; + key->datalen = datalen; return 0; } @@ -223,6 +291,120 @@ sidid_pending_wait(void *unused) return signal_pending(current) ? -ERESTARTSYS : 0; } +static int +id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +{ + int rc = 0; + struct key *sidkey; + const struct cred *saved_cred; + struct cifs_sid *lsid; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -EINVAL; + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + sid_rb_insert(cidtree, cid, &psidid, + sidtype == SIDOWNER ? "oi:" : "gi:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + psidid->time = jiffies; /* update ts for accessing */ + goto id_sid_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { + rc = -EINVAL; + goto id_sid_out; + } + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map and id to a SID", __func__); + } else { + lsid = (struct cifs_sid *)sidkey->payload.data; + memcpy(&psidid->sid, lsid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + memcpy(ssid, &psidid->sid, + sidkey->datalen < sizeof(struct cifs_sid) ? 
+ sidkey->datalen : sizeof(struct cifs_sid)); + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(sidkey); + kfree(psidid->sidstr); + } + psidid->time = jiffies; /* update ts for accessing */ + revert_creds(saved_cred); + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + else + rc = -EINVAL; + } +id_sid_out: + --psidid->refcount; + return rc; +} + static int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) @@ -383,6 +565,10 @@ init_cifs_idmap(void) spin_lock_init(&sidgidlock); gidtree = RB_ROOT; + spin_lock_init(&uidsidlock); + siduidtree = RB_ROOT; + spin_lock_init(&gidsidlock); + sidgidtree = RB_ROOT; register_shrinker(&cifs_shrinker); cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); @@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void) while ((node = rb_first(root))) rb_erase(node, root); spin_unlock(&sidgidlock); + + root = &siduidtree; + spin_lock(&uidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&gidsidlock); } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9551437a2498..3b83fe7bfe60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -984,10 +984,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ /* reconnect after this many failed echo attempts */ GLOBAL_EXTERN unsigned short echo_retries; +#ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; GLOBAL_EXTERN spinlock_t siduidlock; GLOBAL_EXTERN spinlock_t sidgidlock; +GLOBAL_EXTERN struct rb_root siduidtree; +GLOBAL_EXTERN struct rb_root sidgidtree; +GLOBAL_EXTERN spinlock_t uidsidlock; +GLOBAL_EXTERN spinlock_t gidsidlock; +#endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); -- cgit v1.2.3-58-ga151 From a52c1eb7ae79f0a11511d49cfc00d2f9e576abea Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: simplify read_from_socket Move the iovec handling entirely into read_from_socket. That simplifies the code and gets rid of the special handling for header reads. With this we can also get rid of the "goto incomplete_rcv" label in the main demultiplex thread function since we can now treat header and non-header receives the same way. Also, make it return an int (since we'll never receive enough to worry about the sign bit anyway), and simply make it return the amount of bytes read or a negative error code. 
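For illustration, here is a minimal userspace sketch of the convention the rewrite adopts: loop until the requested byte count has arrived, then return that count, or a negative errno-style value on failure. This is plain POSIX-socket code, not cifs code, and read_exact is a hypothetical name:

	#include <errno.h>
	#include <sys/types.h>
	#include <sys/socket.h>

	/* Return 'to_read' on success or a negative error code, mirroring
	 * the simplified read_from_socket() contract described above. */
	static int read_exact(int fd, char *buf, unsigned int to_read)
	{
		unsigned int total = 0;

		while (total < to_read) {
			ssize_t n = recv(fd, buf + total, to_read - total, 0);

			if (n == -1 && (errno == EINTR || errno == EAGAIN))
				continue;	/* transient condition; retry */
			if (n <= 0)
				return -EPIPE;	/* peer closed or hard error */
			total += n;
		}
		return total;
	}

With this shape a caller needs only a single sign test on the return value, which is what lets the demultiplex thread below treat header and body reads identically.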
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 77 +++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 70dd2c418276..ff47c1842bec 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -376,35 +376,33 @@ server_unresponsive(struct TCP_Server_Info *server) } static int -read_from_socket(struct TCP_Server_Info *server, - struct kvec *iov, unsigned int to_read, - unsigned int *ptotal_read, bool is_header_read) +read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) { - int length, rc = 0; - unsigned int total_read; + int length = 0; + int total_read; struct msghdr smb_msg; - char *buf = iov->iov_base; + struct kvec iov; smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; - for (total_read = 0; total_read < to_read; total_read += length) { + for (total_read = 0; to_read; total_read += length, to_read -= length) { if (server_unresponsive(server)) { - rc = 1; + total_read = -EAGAIN; break; } - length = kernel_recvmsg(server->ssocket, &smb_msg, iov, 1, - to_read - total_read, 0); + iov.iov_base = buf + total_read; + iov.iov_len = to_read; + length = kernel_recvmsg(server->ssocket, &smb_msg, &iov, 1, + to_read, 0); if (server->tcpStatus == CifsExiting) { - /* then will exit */ - rc = 2; + total_read = -ESHUTDOWN; break; } else if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - /* Reconnect wakes up rspns q */ - /* Now we will reread sock */ - rc = 1; + total_read = -EAGAIN; break; } else if (length == -ERESTARTSYS || length == -EAGAIN || @@ -416,28 +414,16 @@ read_from_socket(struct TCP_Server_Info *server, */ usleep_range(1000, 2000); length = 0; - if (!is_header_read) - continue; - /* Special handling for header read */ - if (total_read) { - iov->iov_base = (to_read - total_read) + - buf; - iov->iov_len = to_read - total_read; - rc = 3; - } else - rc = 1; - break; + continue; } else if (length <= 0) { - cERROR(1, "Received no data, expecting %d", - to_read - total_read); + cFYI(1, "Received no data or error: expecting %d " + "got %d", to_read, length); cifs_reconnect(server); - rc = 1; + total_read = -EAGAIN; break; } } - - *ptotal_read = total_read; - return rc; + return total_read; } static bool @@ -658,12 +644,10 @@ cifs_demultiplex_thread(void *p) unsigned int pdu_length, total_read; char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; struct smb_hdr *smb_buffer = NULL; - struct kvec iov; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; bool isLargeBuf = false; bool isMultiRsp = false; - int rc; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -686,19 +670,12 @@ cifs_demultiplex_thread(void *p) isMultiRsp = false; smb_buffer = (struct smb_hdr *)smallbuf; buf = smallbuf; - iov.iov_base = buf; - iov.iov_len = 4; pdu_length = 4; /* enough to get RFC1001 header */ -incomplete_rcv: - rc = read_from_socket(server, &iov, pdu_length, - &total_read, true /* header read */); - if (rc == 3) - goto incomplete_rcv; - else if (rc == 2) - break; - else if (rc == 1) + length = read_from_socket(server, buf, pdu_length); + if (length < 0) continue; + total_read = length; /* * The right amount was read from socket - 4 bytes, @@ -718,16 +695,10 @@ incomplete_rcv: buf = bigbuf; } - iov.iov_base = 4 + buf; - iov.iov_len = pdu_length; - rc = read_from_socket(server, &iov, pdu_length, - &total_read, false); - if (rc == 2) - break; - else if 
(rc == 1) + length = read_from_socket(server, buf + 4, pdu_length); + if (length < 0) continue; - - total_read += 4; /* account for rfc1002 hdr */ + total_read += length; dump_smb(smb_buffer, total_read); -- cgit v1.2.3-58-ga151 From 94443f43404239c2a6dc4252a7cb9e77f5b1eb6e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 7 Oct 2011 18:57:45 +0400 Subject: CIFS: Fix incorrect max RFC1002 write size value ..the length field has only 17 bits. Cc: Acked-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ff47c1842bec..82bc0d27e495 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2809,10 +2809,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, /* * When the server doesn't allow large posix writes, only allow a wsize of - * 128k minus the size of the WRITE_AND_X header. That allows for a write up + * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up * to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 -- cgit v1.2.3-58-ga151 From 03776f4516bc299b3145595bdd704d40d69adc02 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 17 Aug 2010 11:26:00 +0400 Subject: CIFS: Simplify byte range locking code Split cifs_lock into several functions and let CIFSSMBLock get pid as an argument. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/cifsproto.h | 2 +- fs/cifs/cifssmb.c | 4 +- fs/cifs/file.c | 370 ++++++++++++++++++++++++++++------------------------ 4 files changed, 205 insertions(+), 172 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 3b83fe7bfe60..1f265ffe7e63 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -492,6 +492,7 @@ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ __u64 offset; __u64 length; + __u32 pid; __u8 type; }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 9ddb1eccde69..94834dbb46da 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,7 +361,7 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, int remap_special_chars); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 netfid, const __u64 len, + const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 602326fa4a4f..e33093f7ef0d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1963,7 +1963,7 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const __u64 len, + const __u16 smb_file_id, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level) @@ -1999,7 +1999,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, pSMB->Fid = smb_file_id; /* netfid stays le */ if ((numLock != 0) || (numUnlock != 0)) { - pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + pSMB->Locks[0].Pid = cpu_to_le16(netpid); /* 
BB where to store pid high? */ pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 237192ae7587..ab85699c5653 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -639,8 +639,8 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *fid, __u64 len, - __u64 offset, __u8 lockType) +static int store_file_lock(struct cifsFileInfo *cfile, __u64 len, + __u64 offset, __u8 type, __u16 netfid) { struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); @@ -648,210 +648,241 @@ static int store_file_lock(struct cifsFileInfo *fid, __u64 len, return -ENOMEM; li->offset = offset; li->length = len; - li->type = lockType; - mutex_lock(&fid->lock_mutex); - list_add(&li->llist, &fid->llist); - mutex_unlock(&fid->lock_mutex); + li->type = type; + li->pid = current->tgid; + mutex_lock(&cfile->lock_mutex); + list_add_tail(&li->llist, &cfile->llist); + mutex_unlock(&cfile->lock_mutex); return 0; } -int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +static void +cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, + bool *wait_flag) { - int rc, xid; - __u32 numLock = 0; - __u32 numUnlock = 0; - __u64 length; - bool wait_flag = false; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - __u16 netfid; - __u8 lockType = LOCKING_ANDX_LARGE_FILES; - bool posix_locking = 0; - - length = 1 + pfLock->fl_end - pfLock->fl_start; - rc = -EACCES; - xid = GetXid(); - - cFYI(1, "Lock parm: 0x%x flockflags: " - "0x%x flocktype: 0x%x start: %lld end: %lld", - cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, - pfLock->fl_end); - - if (pfLock->fl_flags & FL_POSIX) + if (flock->fl_flags & FL_POSIX) cFYI(1, "Posix"); - if (pfLock->fl_flags & FL_FLOCK) + if (flock->fl_flags & FL_FLOCK) cFYI(1, "Flock"); - if (pfLock->fl_flags & FL_SLEEP) { + if (flock->fl_flags & FL_SLEEP) { cFYI(1, "Blocking lock"); - wait_flag = true; + *wait_flag = true; } - if (pfLock->fl_flags & FL_ACCESS) + if (flock->fl_flags & FL_ACCESS) cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); - if (pfLock->fl_flags & FL_LEASE) + "not implemented yet"); + if (flock->fl_flags & FL_LEASE) cFYI(1, "Lease on file - not implemented yet"); - if (pfLock->fl_flags & + if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) - cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); - if (pfLock->fl_type == F_WRLCK) { + *type = LOCKING_ANDX_LARGE_FILES; + if (flock->fl_type == F_WRLCK) { cFYI(1, "F_WRLCK "); - numLock = 1; - } else if (pfLock->fl_type == F_UNLCK) { + *lock = 1; + } else if (flock->fl_type == F_UNLCK) { cFYI(1, "F_UNLCK"); - numUnlock = 1; - /* Check if unlock includes more than - one lock range */ - } else if (pfLock->fl_type == F_RDLCK) { + *unlock = 1; + /* Check if unlock includes more than one lock range */ + } else if (flock->fl_type == F_RDLCK) { cFYI(1, "F_RDLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; - } else if (pfLock->fl_type == F_EXLCK) { + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; + } else if (flock->fl_type == F_EXLCK) { cFYI(1, "F_EXLCK"); - numLock = 1; - } else if (pfLock->fl_type == F_SHLCK) { + *lock = 1; + } else if (flock->fl_type == F_SHLCK) { cFYI(1, "F_SHLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; 
} else cFYI(1, "Unknown type of lock"); +} - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - netfid = ((struct cifsFileInfo *)file->private_data)->netfid; - - if ((tcon->ses->capabilities & CAP_UNIX) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - posix_locking = 1; - /* BB add code here to normalize offset and length to - account for negative length which we can not accept over the - wire */ - if (IS_GETLK(cmd)) { - if (posix_locking) { - int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) - posix_lock_type = CIFS_RDLCK; - else - posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, pfLock, posix_lock_type, - wait_flag); - FreeXid(xid); - return rc; - } - - /* BB we could chain these into one lock request BB */ - rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 1 /* numUnlock */ , - 0 /* numLock */ , lockType, - 0 /* wait flag */, 0); - pfLock->fl_type = F_UNLCK; - if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); - rc = 0; - - } else { - /* if rc == ERR_SHARING_VIOLATION ? */ - rc = 0; +static int +cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + __u16 netfid = cfile->netfid; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - if (lockType & LOCKING_ANDX_SHARED_LOCK) { - pfLock->fl_type = F_WRLCK; - } else { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, 1, - lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, - length, pfLock->fl_start, 1, 0, - lockType | - LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - pfLock->fl_type = F_RDLCK; - if (rc != 0) - cERROR(1, "Error unlocking " - "previously locked range %d " - "during test of lock", rc); - rc = 0; - } else { - pfLock->fl_type = F_WRLCK; - rc = 0; - } - } - } + if (posix_lck) { + int posix_lock_type; + if (type & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, + length, flock, posix_lock_type, + wait_flag); + return rc; + } - FreeXid(xid); + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type, 0, 0); + flock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + rc = 0; return rc; } - if (!numLock && !numUnlock) { - /* if no lock or unlock then nothing - to do since we do not know what it is */ - FreeXid(xid); - return -EOPNOTSUPP; + if (type & LOCKING_ANDX_SHARED_LOCK) { + flock->fl_type = F_WRLCK; + rc = 0; + return rc; } - if (posix_locking) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, + type | LOCKING_ANDX_SHARED_LOCK, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type | LOCKING_ANDX_SHARED_LOCK, + 0, 0); + flock->fl_type 
= F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + } else + flock->fl_type = F_WRLCK; + + rc = 0; + return rc; +} + +static int +cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int lock, int unlock, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + __u16 netfid = cfile->netfid; + + if (posix_lck) { int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - if (numUnlock == 1) + if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, - length, pfLock, posix_lock_type, - wait_flag); - } else { - struct cifsFileInfo *fid = file->private_data; - - if (numLock) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, numLock, lockType, - wait_flag, 0); - - if (rc == 0) { - /* For Windows locks we must store them. */ - rc = store_file_lock(fid, length, - pfLock->fl_start, lockType); - } - } else if (numUnlock) { - /* For each stored lock that this unlock overlaps - completely, unlock it. */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; + rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, length, + flock, posix_lock_type, wait_flag); + goto out; + } - rc = 0; - mutex_lock(&fid->lock_mutex); - list_for_each_entry_safe(li, tmp, &fid->llist, llist) { - if (pfLock->fl_start <= li->offset && - (pfLock->fl_start + length) >= - (li->offset + li->length)) { - stored_rc = CIFSSMBLock(xid, tcon, - netfid, li->length, - li->offset, 1, 0, - li->type, false, 0); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - kfree(li); - } - } + if (lock) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, lock, type, wait_flag, 0); + if (rc == 0) { + /* For Windows locks we must store them. */ + rc = store_file_lock(cfile, length, flock->fl_start, + type, netfid); + } + } else if (unlock) { + /* + * For each stored lock that this unlock overlaps completely, + * unlock it. 
+ */ + int stored_rc = 0; + struct cifsLockInfo *li, *tmp; + + mutex_lock(&cfile->lock_mutex); + list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + + stored_rc = CIFSSMBLock(xid, tcon, netfid, + current->tgid, li->length, + li->offset, 1, 0, li->type, + 0, 0); + if (stored_rc) + rc = stored_rc; + else { + list_del(&li->llist); + kfree(li); } - mutex_unlock(&fid->lock_mutex); } + mutex_unlock(&cfile->lock_mutex); + } +out: + if (flock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, flock); + return rc; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *flock) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + __u16 netfid; + __u8 type; + + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " + "end: %lld", cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); + + cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + netfid = cfile->netfid; + cinode = CIFS_I(file->f_path.dentry->d_inode); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + /* + * BB add code here to normalize offset and length to account for + * negative length which we can not accept over the wire. + */ + if (IS_GETLK(cmd)) { + rc = cifs_getlk(cfile, flock, type, wait_flag, posix_lck, xid); + FreeXid(xid); + return rc; + } + + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + FreeXid(xid); + return -EOPNOTSUPP; } - if (pfLock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, pfLock); + rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, + xid); FreeXid(xid); return rc; } @@ -2423,8 +2454,9 @@ void cifs_oplock_break(struct work_struct *work) * disconnected since oplock already released by the server */ if (!cfile->oplock_break_cancelled) { - rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, + current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } -- cgit v1.2.3-58-ga151 From fe11e4ccb8479d92cd2a101d380d332544b84aaa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: clean up check_rfc1002_header Rename it for better clarity as to what it does and have the caller pass in just the single type byte. Turn the if statement into a switch and optimize it by placing the most common message type at the top. Move the header length check back into cifs_demultiplex_thread in preparation for adding a new receive phase and normalize the cFYI messages. 
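For reference, the session packet types that the new is_smb_response() switches on come from the first byte of the RFC 1002 session header. A purely illustrative enum follows (values are those defined by RFC 1002; the kernel's own constant names live in cifspdu.h and may differ):

	/* First byte of an RFC 1002 session packet (illustrative only) */
	enum rfc1002_session_type {
		RFC1002_SESSION_MESSAGE_T       = 0x00,	/* normal SMB data: the hot path */
		RFC1002_SESSION_REQUEST_T       = 0x81,
		RFC1002_POSITIVE_SESSION_RESP_T = 0x82,
		RFC1002_NEGATIVE_SESSION_RESP_T = 0x83,
		RFC1002_RETARGET_SESSION_RESP_T = 0x84,
		RFC1002_SESSION_KEEP_ALIVE_T    = 0x85,
	};

Putting the 0x00 case first means the common path falls out of the switch after one comparison, which is the optimization the patch mentions.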
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 63 ++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 82bc0d27e495..97a65af2a08a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -427,29 +427,29 @@ read_from_socket(struct TCP_Server_Info *server, char *buf, } static bool -check_rfc1002_header(struct TCP_Server_Info *server, char *buf) +is_smb_response(struct TCP_Server_Info *server, unsigned char type) { - char temp = *buf; - unsigned int pdu_length = be32_to_cpu( - ((struct smb_hdr *)buf)->smb_buf_length); - /* * The first byte big endian of the length field, * is actually not part of the length but the type * with the most common, zero, as regular data. */ - if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { - return false; - } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { - cFYI(1, "Good RFC 1002 session rsp"); - return false; - } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB response */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + cFYI(1, "RFC 1002 session keep alive"); + break; + case RFC1002_POSITIVE_SESSION_RESPONSE: + cFYI(1, "RFC 1002 positive session response"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on * SMB negprot response. */ - cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", - pdu_length); + cFYI(1, "RFC 1002 negative session response"); /* give server a second to clean up */ msleep(1000); /* @@ -458,29 +458,16 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf) * is since we do not begin with RFC1001 session * initialize frame). 
*/ - cifs_set_port((struct sockaddr *) - &server->dstaddr, CIFS_PORT); + cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); cifs_reconnect(server); wake_up(&server->response_q); - return false; - } else if (temp != (char) 0) { - cERROR(1, "Unknown RFC 1002 frame"); - cifs_dump_mem(" Received Data: ", buf, 4); - cifs_reconnect(server); - return false; - } - - /* else we have an SMB response */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - 4, pdu_length+4); + break; + default: + cERROR(1, "RFC 1002 unknown response type 0x%x", type); cifs_reconnect(server); - wake_up(&server->response_q); - return false; } - return true; + return false; } static struct mid_q_entry * @@ -683,10 +670,20 @@ cifs_demultiplex_thread(void *p) */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); - cFYI(1, "rfc1002 length 0x%x", pdu_length+4); - if (!check_rfc1002_header(server, buf)) + cFYI(1, "RFC1002 header 0x%x", pdu_length); + if (!is_smb_response(server, buf[0])) continue; + /* check the length */ + if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || + (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { + cERROR(1, "Invalid size SMB length %d pdu_length %d", + 4, pdu_length + 4); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; + } + /* else length ok */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { isLargeBuf = true; -- cgit v1.2.3-58-ga151 From d59dad2be038132259ac99a2837d65a87fd90588 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 22 Sep 2011 09:53:59 +0400 Subject: CIFS: Move byte range lock list from fd to inode This lets us do local lock checks before sending requests to the server. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 ++- fs/cifs/cifsglob.h | 7 ++++--- fs/cifs/file.c | 30 +++++++++++++++++------------- 3 files changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5c2972106816..b0a2e1647390 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -945,7 +945,8 @@ cifs_init_once(void *inode) struct cifsInodeInfo *cifsi = inode; inode_init_once(&cifsi->vfs_inode); - INIT_LIST_HEAD(&cifsi->lockList); + INIT_LIST_HEAD(&cifsi->llist); + mutex_init(&cifsi->lock_mutex); } static int diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1f265ffe7e63..55ebf39fb3fd 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -494,6 +494,7 @@ struct cifsLockInfo { __u64 length; __u32 pid; __u8 type; + __u16 netfid; }; /* @@ -526,8 +527,6 @@ struct cifsFileInfo { struct dentry *dentry; unsigned int f_flags; struct tcon_link *tlink; - struct mutex lock_mutex; - struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; int count; /* refcount protected by cifs_file_list_lock */ @@ -560,7 +559,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { - struct list_head lockList; + struct list_head llist; /* brlocks for this inode */ + bool can_cache_brlcks; + struct mutex lock_mutex; /* protect two fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g.
DOS archive bit, sparse, compressed, system */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ab85699c5653..7f84ece116d0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -262,8 +262,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, pCifsFile->invalidHandle = false; pCifsFile->tlink = cifs_get_tlink(tlink); mutex_init(&pCifsFile->fh_mutex); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -331,12 +329,14 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - mutex_lock(&cifs_file->lock_mutex); - list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + mutex_lock(&cifsi->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) { + if (li->netfid != cifs_file->netfid) + continue; list_del(&li->llist); kfree(li); } - mutex_unlock(&cifs_file->lock_mutex); + mutex_unlock(&cifsi->lock_mutex); cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); @@ -639,20 +639,21 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *cfile, __u64 len, +static int store_file_lock(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, __u8 type, __u16 netfid) { struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); if (li == NULL) return -ENOMEM; + li->netfid = netfid; li->offset = offset; li->length = len; li->type = type; li->pid = current->tgid; - mutex_lock(&cfile->lock_mutex); - list_add_tail(&li->llist, &cfile->llist); - mutex_unlock(&cfile->lock_mutex); + mutex_lock(&cinode->lock_mutex); + list_add_tail(&li->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); return 0; } @@ -769,6 +770,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, __u64 length = 1 + flock->fl_end - flock->fl_start; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); __u16 netfid = cfile->netfid; if (posix_lck) { @@ -791,7 +793,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, flock->fl_start, 0, lock, type, wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. 
*/ - rc = store_file_lock(cfile, length, flock->fl_start, + rc = store_file_lock(cinode, length, flock->fl_start, type, netfid); } } else if (unlock) { @@ -802,14 +804,16 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, int stored_rc = 0; struct cifsLockInfo *li, *tmp; - mutex_lock(&cfile->lock_mutex); - list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { + mutex_lock(&cinode->lock_mutex); + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { if (flock->fl_start > li->offset || (flock->fl_start + length) < (li->offset + li->length)) continue; if (current->tgid != li->pid) continue; + if (cfile->netfid != li->netfid) + continue; stored_rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, li->length, @@ -822,7 +826,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, kfree(li); } } - mutex_unlock(&cfile->lock_mutex); + mutex_unlock(&cinode->lock_mutex); } out: if (flock->fl_flags & FL_POSIX) -- cgit v1.2.3-58-ga151 From b916c5cd4d895a27b47a652648958f73e4f23ac6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:55:51 +0300 Subject: ore: Only IO one group at a time (API change) Usually a single IO is confined to one group of devices (group_width) and at the boundary of a raid group it can spill into a second group. Current code would allocate a full device_table size array at each io_state so it could satisfy requests that span two groups. Needless to say that is very wasteful, especially when the device_table count can get very large (hundreds, even thousands), while a group_width is usually 8 or 10. * Change the ore API to trim an IO that spans two raid groups. The user passes offset+length to ore_get_rw_state; the ore might trim that length if it spans a group boundary. The user must check ios->length or ios->nrpages to see how much IO will actually be performed. It is the responsibility of the user to re-issue the remainder of the IO. * Modify exofs to copy spilled pages onto the next IO. This means one last kick is needed after all coalescing of pages is done.
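To make the new contract concrete, here is a schematic sketch of the caller loop it implies (not exofs code; error cleanup such as ore_put_io_state() is elided, and a synchronous read is assumed):

	/* Issue [offset, offset+length) in per-group pieces, re-issuing
	 * whatever the ore trims at each raid-group boundary. */
	while (length) {
		struct ore_io_state *ios;
		int ret;

		ret = ore_get_rw_state(layout, oc, true /* reading */,
				       offset, length, &ios);
		if (unlikely(ret))
			return ret;

		ret = ore_read(ios);	/* performs at most ios->length bytes */
		if (unlikely(ret))
			return ret;

		offset += ios->length;	/* advance past what was actually done */
		length -= ios->length;	/* the remainder becomes the next IO */
	}

exofs implements exactly this by carrying the spilled pages into the next page_collect, as the inode.c changes below show.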
Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 100 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/exofs/ore.c | 105 ++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 154 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 61b2f7e5cdbd..d87c1f7562fb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; if (pcol->read_4_write) { ore_read(pcol->ios); @@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol) ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); - if (unlikely(ret)) goto err; @@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; ios->done = writepages_done; ios->private = pcol_copy; + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx 
length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_write(ios); if (unlikely(ret)) { EXOFS_ERR("write_exec: ore_write() Failed\n"); @@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } static int exofs_writepage(struct page *page, struct writeback_control *wbc) diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index a7d79257fc65..c1c2cc607adf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); +static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { return ios->oc->comps[index & ios->oc->single_comp].cred; @@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, - bool is_reading, u64 offset, u64 length, - struct ore_io_state **pios) +static int _get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + struct ore_io_state **pios) { struct ore_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); if (unlikely(!ios)) { ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(oc->numdevs)); + ore_io_state_size(numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; ios->oc = oc; - ios->offset = offset; - ios->length = length; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) 
+ * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + int ret; + + ret = _get_io_state(layout, oc, numdevs, pios); + if (unlikely(ret)) + return ret; + + ios = *pios; ios->reading = is_reading; + ios->offset = offset; + + if (length) { + struct ore_striping_info si; + + ore_calc_stripe_info(layout, offset, &si); + ios->length = (length <= si.group_length) ? length : + si.group_length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + } - *pios = ios; return 0; } EXPORT_SYMBOL(ore_get_rw_state); +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, - struct ore_io_state **ios) + struct ore_io_state **pios) { - return ore_get_rw_state(layout, oc, true, 0, 0, ios); + return _get_io_state(layout, oc, oc->numdevs, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -374,12 +424,12 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { - struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -397,9 +447,6 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } - - if (max_comp < dev) - max_comp = dev; } else { cur_len = stripe_unit; } @@ -417,17 +464,15 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, length -= cur_len; } out: - ios->numdevs = max_comp + mirrors_p1; + ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; return ret; } static int _prepare_for_striping(struct ore_io_state *ios) { - u64 length = ios->length; - u64 offset = ios->offset; struct ore_striping_info si; - int ret = 0; + int ret; if (!ios->pages) { if (ios->kern_buff) { @@ -446,21 +491,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) return 0; } - while (length) { - ore_calc_stripe_info(ios->layout, offset, &si); - - if (length < si.group_length) - si.group_length = length; + ore_calc_stripe_info(ios->layout, ios->offset, &si); - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; + BUG_ON(ios->length > si.group_length); + ret = _prepare_one_group(ios, ios->length, &si); - offset += si.group_length; - length -= si.group_length; - } - -out: return ret; } @@ -742,7 +777,6 @@ struct _trunc_info { unsigned first_group_dev; unsigned nex_group_dev; - unsigned max_devs; }; static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, @@ -757,7 +791,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, 
ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); ti->nex_group_dev = ti->first_group_dev + layout->group_width; - ti->max_devs = layout->group_width * layout->group_count; } int ore_truncate(struct ore_layout *layout, struct ore_components *oc, @@ -777,7 +810,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, _calc_trunk_info(ios->layout, size, &ti); - size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; @@ -786,7 +819,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, ios->numdevs = ios->oc->numdevs; - for (i = 0; i < ti.max_devs; ++i) { + for (i = 0; i < ios->numdevs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; -- cgit v1.2.3-58-ga151 From 98260754046eee4cc7d75751a4a20182ade39f58 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 2 Oct 2011 15:32:50 +0200 Subject: ore: cleanup: Embed an ore_striping_info inside ore_io_state Now that each ore_io_state covers only a single raid group, only a single striping_info calculation is needed. Embed one inside ore_io_state to cache the calculation results and eliminate an extra call. Also the outer _prepare_for_striping is removed since it does nothing. Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 61 ++++++++++++++++++++------------------------ include/scsi/osd_ore.h | 1 + 2 files changed, 25 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index c1c2cc607adf..7e02d33b5e5c 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -121,11 +121,9 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, ios->offset = offset; if (length) { - struct ore_striping_info si; - - ore_calc_stripe_info(layout, offset, &si); - ios->length = (length <= si.group_length) ? length : - si.group_length; + ore_calc_stripe_info(layout, offset, &ios->si); + ios->length = (length <= ios->si.group_length) ?
length : + ios->si.group_length; ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; } @@ -416,17 +414,36 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct ore_striping_info *si) +static int _prepare_for_striping(struct ore_io_state *ios) { + struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; int ret = 0; + if (!ios->pages) { + if (ios->kern_buff) { + struct ore_per_dev_state *per_dev = &ios->per_dev[0]; + + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si->unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->group_length); + while (length) { unsigned comp = dev - first_dev; struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; @@ -469,36 +486,6 @@ out: return ret; } -static int _prepare_for_striping(struct ore_io_state *ios) -{ - struct ore_striping_info si; - int ret; - - if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - - ore_calc_stripe_info(ios->layout, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - ore_calc_stripe_info(ios->layout, ios->offset, &si); - - BUG_ON(ios->length > si.group_length); - ret = _prepare_one_group(ios, ios->length, &si); - - return ret; -} - int ore_create(struct ore_io_state *ios) { int i, ret; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 8fefdfbb1ced..baeef0200a1f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -93,6 +93,7 @@ typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); struct ore_io_state { struct kref kref; + struct ore_striping_info si; void *private; ore_io_done_fn done; -- cgit v1.2.3-58-ga151 From 6851a5e5c12b35f8bb8986d0ff16ca9be4cf2c09 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 24 Aug 2011 18:10:49 -0700 Subject: ore: Remove check for ios->kern_buff in _prepare_for_striping to later Move the check and preparation of the ios->kern_buff case to later inside _write_mirror(). Since read was never used with ios->kern_buff its support is removed instead of fixed. 
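For illustration, a minimal usage sketch of a kern_buff write under the ore API shown in these patches (the helper name is hypothetical and error handling is abbreviated); after this change kern_buff is write-only and the buffer must not cross a stripe unit:

static int example_write_kern(struct ore_layout *layout,
			      struct ore_components *oc,
			      u64 offset, void *buf, u64 len)
{
	struct ore_io_state *ios;
	int ret;

	/* is_reading=false: kern_buff reads are no longer supported */
	ret = ore_get_rw_state(layout, oc, false, offset, len, &ios);
	if (unlikely(ret))
		return ret;

	ios->kern_buff = buf;	/* must fit inside a single stripe unit */
	ret = ore_write(ios);	/* _write_mirror() fills per_dev->offset/dev */
	ore_put_io_state(ios);
	return ret;
}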
Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 7e02d33b5e5c..d0f8db5e46c5 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -427,17 +427,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) int ret = 0; if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - - per_dev->offset = si->obj_offset; - per_dev->dev = si->dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si->unit_off + ios->length > - ios->layout->stripe_unit)); - } ios->numdevs = ios->layout->mirrors_p1; return 0; } @@ -557,7 +546,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } per_dev->or = or; - per_dev->offset = master_dev->offset; if (ios->pages) { struct bio *bio; @@ -576,6 +564,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; + per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; per_dev->bio = bio; per_dev->dev = dev; @@ -593,7 +582,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) @@ -602,7 +599,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) "length=0x%llx dev=%d\n", _LLU(_ios_obj(ios, dev)->id), _LLU(per_dev->offset), - _LLU(ios->length), dev); + _LLU(ios->length), per_dev->dev); } else { osd_req_set_attributes(or, _ios_obj(ios, dev)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", @@ -668,16 +665,9 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) " dev=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, obj, per_dev->offset, - ios->kern_buff, ios->length); - ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(obj->id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; } else { + BUG_ON(ios->kern_buff); + osd_req_get_attributes(or, obj); ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(obj->id), -- cgit v1.2.3-58-ga151 From 154a9300cd87eee7538f356c9d339f06b0b1d257 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 26 Aug 2011 21:00:32 -0700 Subject: exofs: Support for short read/writes If at read/write_done the actual IO was shorter than requested, the shortfall is reported in the returned ios->length. It is not an error. The remainder of the pages should just be unlocked, but not marked uptodate or end_page_writeback. They will be re-issued later by the VFS.
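The per-page rule this implies can be sketched as below; this is a hedged illustration, not the patch's code (exofs actually routes it through update_read_page()/update_write_page() and the PAGE_WAS_NOT_IN_IO marker in the diff that follows):

static void example_read_pages_done(struct page **pages, unsigned nr_pages,
				    u64 good_bytes)
{
	unsigned i;

	/* good_bytes is the IO actually executed (clamped to ios->length) */
	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];

		if (((u64)i << PAGE_CACHE_SHIFT) < good_bytes)
			SetPageUptodate(page);	/* the IO really covered it */
		/* else: neither Uptodate nor Error, so the VFS will
		 * simply re-issue the page later */
		unlock_page(page);
	}
}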
Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d87c1f7562fb..96366a1d7eae 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -149,14 +149,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -167,16 +170,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -195,10 +204,14 @@ static int __readpages_done(struct page_collect *pcol) u64 length = 0; int ret = ore_check_io(pcol->ios, &resid); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else + ret = PAGE_WAS_NOT_IN_IO; + } else { good_bytes = pcol->length - resid; + } + if (good_bytes > pcol->ios->length) + good_bytes = pcol->ios->length; EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -518,10 +531,14 @@ static void writepages_done(struct ore_io_state *ios, void *p) atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else + ret = PAGE_WAS_NOT_IN_IO; + } else { good_bytes = pcol->length - resid; + } + if (good_bytes > pcol->ios->length) + good_bytes = pcol->ios->length; EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", -- cgit v1.2.3-58-ga151 From bbf9a31bba8c985780fe94da059cc5813a7920f5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 26 Aug 2011 21:04:52 -0700 Subject: ore: Support for short read/writes Memory conditions and max_bio constraints might cause us to not comply with the full length of the requested IO. Instead of failing the complete IO we can issue a shorter read/write and report how much was actually executed in the ios->length member. All users must check ios->length at IO_done or upon return of ore_read/write and re-issue the remainder of the bytes, because otherwise no error is returned as before. This is part of the effort to support the pnfs-obj layout driver.
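A caller honoring this contract would loop roughly as in the following sketch (synchronous read, page attachment elided, helper name hypothetical):

static int example_read_all(struct ore_layout *layout,
			    struct ore_components *oc,
			    u64 offset, u64 length)
{
	while (length) {
		struct ore_io_state *ios;
		int ret;

		ret = ore_get_rw_state(layout, oc, true, offset, length, &ios);
		if (unlikely(ret))
			return ret;

		/* ... attach pages covering [offset, offset + ios->length) ... */

		ret = ore_read(ios);	/* synchronous when ios->done is unset */
		if (unlikely(ret)) {
			ore_put_io_state(ios);
			return ret;
		}
		offset += ios->length;	/* may be less than we asked for */
		length -= ios->length;
		ore_put_io_state(ios);
	}
	return 0;
}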
Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d0f8db5e46c5..8354fe061d1c 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -377,8 +377,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; + unsigned len = cur_len; + int ret; if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * @@ -390,7 +390,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, if (unlikely(!per_dev->bio)) { ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } @@ -403,15 +404,24 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; + if (unlikely(pglen != added_len)) { + ret = -ENOMEM; + goto out; + } pgbase = 0; ++pg; } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; - return 0; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the opposite is fatal. + */ + return ret; } static int _prepare_for_striping(struct ore_io_state *ios) @@ -472,7 +482,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) out: ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; - return ret; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; } int ore_create(struct ore_io_state *ios) -- cgit v1.2.3-58-ga151 From 3bd9856857339d7ee8c4ad50030583f1b9415c39 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 12:04:23 +0300 Subject: ore: Support for partial component table Users like the objlayout-driver would like to only pass a partial device table that covers the IO in question. For example exofs divides the file into raid-group-sized chunks and only serves group_width number of devices at a time. The partiality is communicated by setting ore_components->first_dev, and the array covers all logical devices from oc->first_dev up to (oc->first_dev + oc->numdevs). The ore_comp_dev() API receives a logical device index and returns the actual present device in the table. An out-of-range dev_index will BUG. Logical device index is the theoretical device index as if all the devices of a file are present.
i.e.: total_devs = group_width * mirrors_p1 * group_count, and 0 <= dev_index < total_devs. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 1 + fs/exofs/ore.c | 4 ++++ include/scsi/osd_ore.h | 7 ++++--- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 006fd6f33571..51f4b4c40f09 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -217,6 +217,7 @@ static inline void exofs_init_comps(struct ore_components *oc, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); + oc->first_dev = 0; oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * sbi->layout.group_count; oc->single_comp = EC_SINGLE_COMP; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8354fe061d1c..f1b718028a1f 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -62,6 +62,10 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + return ore_comp_dev(ios->oc, index); } diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index baeef0200a1f..492b70d43bb6 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -49,6 +49,7 @@ struct ore_dev { }; struct ore_components { + unsigned first_dev; /* First logical device no */ unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single * component. else there are @numdevs components @@ -70,14 +71,14 @@ struct ore_components { static inline struct osd_dev *ore_comp_dev( const struct ore_components *oc, unsigned i) { - BUG_ON(oc->numdevs <= i); - return oc->ods[i]->od; + BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i)); + return oc->ods[i - oc->first_dev]->od; } static inline void ore_comp_set_dev( struct ore_components *oc, unsigned i, struct osd_dev *od) { - oc->ods[i]->od = od; + oc->ods[i - oc->first_dev]->od = od; } struct ore_striping_info { -- cgit v1.2.3-58-ga151 From 5a51c0c7e9a913649aa65d8233470682bcbb7694 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:18:45 +0300 Subject: ore/exofs: Define new ore_verify_layout All users of the ore will need to check if current code supports the given layout. For example RAID5/6 is not currently supported. So move all the checks from exofs/super.c to a new ore_verify_layout() to be used by ore users. Note that any new layout should be passed through the ore_verify_layout() because the ore engine will prepare and verify some internal members of ore_layout, and assumes it has been called. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 9 ++----- fs/exofs/ore.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/exofs/super.c | 49 +++--------------------------------- include/scsi/osd_ore.h | 8 ++++++ 4 files changed, 80 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 96366a1d7eae..5a62420cbdb1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,11 +37,7 @@ #define EXOFS_DBGMSG2(M...)
do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) @@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f1b718028a1f..4ca59d492798 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -47,9 +47,76 @@ MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, struct ore_striping_info *si); +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; + +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + +/* FIXME: Only raid0 is supported for now. */ + if (layout->raid_algorithm != PNFS_OSD_RAID_0) { + ORE_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } + + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } + + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); + static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { return ios->oc->comps[index & ios->oc->single_comp].cred; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index bce3686f0aa0..057b237b8b69 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -480,7 +480,7 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct 
exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); @@ -493,50 +493,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) { - EXOFS_ERR("Data Map wrong, " - "numdevs=%d < group_width=%d * mirrors=%d\n", - numdevs, sbi->layout.group_width, - sbi->layout.mirrors_p1); - return -EINVAL; - } - - if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->layout.stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - if (sbi->layout.group_width) { - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 / - sbi->layout.group_width; - } else { - if (sbi->layout.group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %lld\n", - _LLU(sbi->layout.group_depth)); - } - sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); EXOFS_DBGMSG("exofs: layout: " "num_comps=%u stripe_unit=0x%x group_width=%u " @@ -547,7 +504,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, sbi->layout.raid_algorithm); - return 0; + return ret; } static unsigned __ra_pages(struct ore_layout *layout) diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 492b70d43bb6..716dbeae8cd2 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -42,6 +42,13 @@ struct ore_layout { unsigned group_width; u64 group_depth; unsigned group_count; + + /* Cached often needed calculations filled in by + * ore_verify_layout + */ + unsigned long max_io_length; /* Max length that should be passed to + * ore_get_rw_state + */ }; struct ore_dev { @@ -138,6 +145,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) } /* ore.c */ +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3-58-ga151 From 4b46c9f5cf69505f0bc708995b88b0cc60317ffd Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:25:50 +0300 Subject: ore/exofs: Change ore_check_io API The current ore_check_io API receives a residual pointer to report partial IO. But it is actually not used, because in a multiple-device IO there is no linearity in the IO failure. On the other hand, if every failing device is reported through a supplied callback, measures can be taken to handle only the failed devices, one at a time. This will also be needed by the objects-layout-driver for its error reporting facility.
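The shape of such a callback, matching the ore_on_dev_error typedef added below, might look like this sketch (the body is illustrative only; a pnfs-objects user would record the failing range for its error report):

static void example_on_dev_error(struct ore_io_state *ios,
				 struct ore_dev *od, unsigned dev_index,
				 enum osd_err_priority oep,
				 u64 dev_offset, u64 dev_len)
{
	pr_err("ore: IO error dev_index=%u offset=0x%llx len=0x%llx pri=%d\n",
	       dev_index, (unsigned long long)dev_offset,
	       (unsigned long long)dev_len, oep);
	/* e.g. mark od as suspect, or queue (dev_offset, dev_len) for a
	 * layout error report -- one failed device at a time */
}

/* a user would then call: ret = ore_check_io(ios, example_on_dev_error); */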
Exofs is not currently using the new information and keeps the old behaviour of failing the complete IO in case of an error. (No partial completion) TODO: Use an ore_check_io callback to set_page_error only the failing pages. And re-dirty write pages. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 14 ++++---------- fs/exofs/ore.c | 29 ++++++++++++++++------------- include/scsi/osd_ore.h | 5 ++++- 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5a62420cbdb1..86c0ac87b8e3 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -194,19 +194,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); if (likely(!ret)) { good_bytes = pcol->length; ret = PAGE_WAS_NOT_IN_IO; } else { - good_bytes = pcol->length - resid; + good_bytes = 0; } - if (good_bytes > pcol->ios->length) - good_bytes = pcol->ios->length; EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -519,10 +516,9 @@ static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); @@ -530,10 +526,8 @@ static void writepages_done(struct ore_io_state *ios, void *p) good_bytes = pcol->length; ret = PAGE_WAS_NOT_IN_IO; } else { - good_bytes = pcol->length - resid; + good_bytes = 0; } - if (good_bytes > pcol->ios->length) - good_bytes = pcol->ios->length; EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 4ca59d492798..3b1cc3a132d7 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -317,7 +317,7 @@ static void _clear_bio(struct bio *bio) } } -int ore_check_io(struct ore_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -325,7 +325,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; int ret; if (unlikely(!or)) @@ -337,29 +338,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); + _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); + _LLU(per_dev->offset), + _LLU(per_dev->length)); continue; /* we recovered */ } + if (on_dev_error) { + u64 residual = ios->reading ? 
+ or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + struct ore_dev *od = ios->oc->ods[ + per_dev->dev - ios->oc->first_dev]; + + on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + offset, residual); + } if (osi.osd_err_pri >= acumulated_osd_err) { acumulated_osd_err = osi.osd_err_pri; acumulated_lin_err = ret; } } - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - return acumulated_lin_err; } EXPORT_SYMBOL(ore_check_io); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 716dbeae8cd2..af2231a0fd09 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -153,7 +153,10 @@ int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, struct ore_io_state **ios); void ore_put_io_state(struct ore_io_state *ios); -int ore_check_io(struct ore_io_state *ios, u64 *resid); +typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od, + unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len); +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep); int ore_create(struct ore_io_state *ios); int ore_remove(struct ore_io_state *ios); -- cgit v1.2.3-58-ga151 From 01cd4afadbf376de07d364a632cc82a0fc5e8655 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Oct 2011 10:41:17 +0300 Subject: nfsd4: typo logical vs bitwise negate This should be a bitwise negate here. It silences a Sparse warning: fs/nfsd/nfs4xdr.c:693:16: warning: dubious: x & !y Signed-off-by: Dan Carpenter Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2cab33cc3238..645a0a9d8073 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -690,7 +690,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) READ_BUF(4); READ32(*x); /* Note: unlinke access bits, deny bits may be zero. */ - if (*x & !NFS4_SHARE_DENY_BOTH) + if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; return nfs_ok; xdr_error: -- cgit v1.2.3-58-ga151 From a5ff376966c079bd2f078524eff11b0c63cc2507 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 13 Oct 2011 10:26:03 -0500 Subject: cifs: Call id to SID mapping functions to change owner/group (try #4 repost) Now build a security descriptor to change either owner or group at the server. Initially the security descriptor was built to change only the (D)ACL; that functionality has been extended. When either an Owner or a Group of a file object at the server is changed, the rest of the security descriptor remains the same (DACL etc.). To set a security descriptor, it is necessary to open that file with permission bits of either WRITE_DAC if the DACL is being modified, or WRITE_OWNER (Take Ownership) if the Owner or Group is being changed. It is the server that decides whether a set security descriptor with either owner or group change succeeds or not.
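Distilled, the access-right rule the patch applies when opening the file (a sketch mirroring the set_cifs_acl() change in the diff below):

static __u32 access_flags_for_acl_change(int aclflag)
{
	/* chown/chgrp rewrite the owner/group SIDs: need "Take Ownership" */
	if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
		return WRITE_OWNER;
	/* chmod rewrites only the DACL */
	return WRITE_DAC;
}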
Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 135 ++++++++++++++++++++++++++++++++-------------------- fs/cifs/cifsproto.h | 7 +-- fs/cifs/cifssmb.c | 4 +- fs/cifs/inode.c | 35 +++++++++----- fs/cifs/xattr.c | 2 +- 5 files changed, 113 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 992f1fb299d1..72ddf23ef6f7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -904,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, acl_size = sizeof(struct cifs_acl); num_aces = le32_to_cpu(pdacl->num_aces); - if (num_aces > 0) { + if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; @@ -1066,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, else cFYI(1, "no ACL"); /* BB grant all or default perms? */ -/* cifscred->uid = owner_sid_ptr->rid; - cifscred->gid = group_sid_ptr->rid; - memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, - sizeof(struct cifs_sid)); - memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, - sizeof(struct cifs_sid)); */ - return rc; } - /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - struct inode *inode, __u64 nmode) + __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ - if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return -EIO; - - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (nmode != NO_CHANGE_64) { /* chmod */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); - - dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; - - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); - - sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - - /* copy security descriptor control portion and owner and group sid */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + nmode); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy sec desc control portion & owner and group sids */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + *aclflag = CIFS_ACL_DACL; + } else { + memcpy(pnntsd, pntsd, secdesclen); + if (uid != NO_CHANGE_32) { /* chown */ + owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->osidoffset)); + nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!nowner_sid_ptr) + 
return -ENOMEM; + rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for owner id %d", + __func__, rc, uid); + kfree(nowner_sid_ptr); + return rc; + } + memcpy(owner_sid_ptr, nowner_sid_ptr, + sizeof(struct cifs_sid)); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } + if (gid != NO_CHANGE_32) { /* chgrp */ + group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->gsidoffset)); + ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!ngroup_sid_ptr) + return -ENOMEM; + rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for group id %d", + __func__, rc, gid); + kfree(ngroup_sid_ptr); + return rc; + } + memcpy(group_sid_ptr, ngroup_sid_ptr, + sizeof(struct cifs_sid)); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } + } return rc; } @@ -1192,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, - struct cifs_ntsd *pnntsd, u32 acllen) + /* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path, int aclflag) { int oplock = 0; - int xid, rc, create_options = 0; + int xid, rc, access_flags, create_options = 0; __u16 fid; struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1210,15 +1242,20 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, create_options, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) + access_flags = WRITE_OWNER; + else + access_flags = WRITE_DAC; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { cERROR(1, "Unable to open file to set ACL"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); cFYI(DBG2, "SetCIFSACL rc = %d", rc); CIFSSMBClose(xid, tcon, fid); @@ -1228,17 +1265,6 @@ out: return rc; } -/* Set an ACL on the server */ -int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - - cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); - - return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); -} - /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -1270,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +int +id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, + uid_t uid, gid_t gid) { int rc = 0; + int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ @@ -1302,13 +1331,15 @@ int 
mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) return -ENOMEM; } - rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); cFYI(DBG2, "build_sec_desc rc: %d", rc); if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + rc = set_cifs_acl(pnntsd, secdesclen, inode, + path, aclflag); cFYI(DBG2, "set_cifs_acl rc: %d", rc); } diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 94834dbb46da..a1fa9cec05d6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -146,11 +146,12 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, + uid_t, gid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, - const char *); + const char *, int); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); @@ -420,7 +421,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, - struct cifs_ntsd *, __u32); + struct cifs_ntsd *, __u32, int); extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e33093f7ef0d..c824c106b2b7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3463,7 +3463,7 @@ qsec_out: int CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd *pntsd, __u32 acllen) + struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) { __u16 byte_count, param_count, data_count, param_offset, data_offset; int rc = 0; @@ -3500,7 +3500,7 @@ setCifsAclRetry: pSMB->Fid = fid; /* file handle always le */ pSMB->Reserved2 = 0; - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + pSMB->AclFlags = cpu_to_le32(aclflag); if (pntsd && acllen) { memcpy((char *) &pSMBr->hdr.Protocol + data_offset, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a7b2dcd4a53e..663c4e313be4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2096,6 +2096,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { int xid; + uid_t uid = NO_CHANGE_32; + gid_t gid = NO_CHANGE_32; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,13 +2148,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } - /* - * Without unix extensions we can't send ownership changes to the - * server, so silently ignore them. This is consistent with how - * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With - * CIFSACL support + proper Windows to Unix idmapping, we may be - * able to support this in the future. 
- */ + if (attrs->ia_valid & ATTR_UID) + uid = attrs->ia_uid; + + if (attrs->ia_valid & ATTR_GID) + gid = attrs->ia_gid; + +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, + uid, gid); + if (rc) { + cFYI(1, "%s: Setting id failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } + } else +#endif /* CONFIG_CIFS_ACL */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); @@ -2161,15 +2175,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, "Mode changed to 0%o", attrs->ia_mode); mode = attrs->ia_mode; - } - - if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = mode_to_cifs_acl(inode, full_path, mode); + rc = id_mode_to_cifs_acl(inode, full_path, mode, + NO_CHANGE_32, NO_CHANGE_32); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 2a22fb2989e4..fde15b1a1eb5 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -178,7 +178,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #ifdef CONFIG_CIFS_ACL memcpy(pacl, ea_value, value_size); rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path); + direntry->d_inode, full_path, CIFS_ACL_DACL); if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); -- cgit v1.2.3-58-ga151 From f472e02669073e4f1a388142bafa0f806fae841c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 17 Oct 2011 10:13:46 -0400 Subject: ext4: avoid stamping on other memories in ext4_ext_insert_index() Add a sanity check to make sure ix hasn't gone beyond the valid bounds of the extent block. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f473ddf0bd94..5c4861210d4c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -779,6 +779,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ix = curp->p_idx; } + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); -- cgit v1.2.3-58-ga151 From a50d2ad1721c0c785e9a74c0003ca044de6868a5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Oct 2011 16:24:27 -0400 Subject: nfsd4: centralize renew_client() calls There doesn't seem to be any harm to renewing the client a bit earlier, when it is looked up. That saves us from having to sprinkle renew_client calls over quite so many places. Also remove a redundant comment and do a little cleanup. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2042805da960..d90461eb9368 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -931,9 +931,6 @@ renew_client_locked(struct nfs4_client *clp) return; } - /* - * Move client to the end to the LRU list. 
- */ dprintk("renewing client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -1220,8 +1217,10 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) { + renew_client(clp); return clp; + } } return NULL; } @@ -2372,11 +2371,15 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { - struct nfs4_stateowner *so = NULL; + struct nfs4_stateowner *so; + struct nfs4_openowner *oo; list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return container_of(so, struct nfs4_openowner, oo_owner); + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { + oo = openowner(so); + renew_client(oo->oo_owner.so_client); + return oo; + } } return NULL; } @@ -2536,7 +2539,6 @@ renew: open->op_openowner = oo; } list_del_init(&oo->oo_close_lru); - renew_client(oo->oo_owner.so_client); return nfs_ok; } @@ -2970,7 +2972,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("nfsd4_renew: clientid not found!\n"); goto out; } - renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) @@ -3275,7 +3276,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_stid.sc_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); @@ -3293,7 +3293,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfs4_check_openmode(stp, flags); if (status) goto out; - renew_client(stp->st_stateowner->so_client); if (filpp) { if (flags & RD_STATE) *filpp = find_readable_file(stp->st_file); @@ -3412,7 +3411,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return status; *stpp = openlockstateid(s); cstate->replay_owner = (*stpp)->st_stateowner; - renew_client((*stpp)->st_stateowner->so_client); return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } @@ -3643,7 +3641,6 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_stid.sc_client); unhash_delegation(dp); out: -- cgit v1.2.3-58-ga151 From 3557e43b8f78e5c2347bab31626fdb4d09220ae7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Oct 2011 16:58:21 -0400 Subject: nfsd4: make is_open_owner boolean Signed-off-by: J. 
Bruce Fields --- fs/nfsd/state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index aa14f06af2df..87eecfd9b968 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -335,13 +335,13 @@ struct nfs4_replay { struct nfs4_stateowner { struct list_head so_strhash; /* hash by op_name */ struct list_head so_stateids; - int so_is_open_owner; /* 1=openowner,0=lockowner */ struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ struct nfs4_replay so_replay; + bool so_is_open_owner; }; struct nfs4_openowner { -- cgit v1.2.3-58-ga151 From bcf130f9dfbaccf91376a44b18f51ed8007840d6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Oct 2011 20:44:20 -0400 Subject: nfsd4: simplify process_open1 logic No change in behavior. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d90461eb9368..62aa91ae278b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2506,7 +2506,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; - __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; @@ -2515,30 +2514,25 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, oo = find_openstateowner_str(strhashval, open); open->op_openowner = oo; if (!oo) { - /* Make sure the client's lease hasn't expired. */ clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; - goto renew; + goto new_owner; } if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ clp = oo->oo_owner.so_client; release_openowner(oo); open->op_openowner = NULL; - goto renew; - } - status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); - if (status) - return status; -renew: - if (open->op_openowner == NULL) { - oo = alloc_init_open_stateowner(strhashval, clp, open); - if (oo == NULL) - return nfserr_jukebox; - open->op_openowner = oo; + goto new_owner; } list_del_init(&oo->oo_close_lru); + return nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); +new_owner: + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; return nfs_ok; } -- cgit v1.2.3-58-ga151 From d29b20cd589128a599e5045d4effc2d7dbc388f5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2011 15:12:59 -0400 Subject: nfsd4: clean up open owners on OPEN failure If process_open1() creates a new open owner, but the open later fails, the current code will leave the open owner around. It won't be on the close_lru list, and the client isn't expected to send a CLOSE, so it will hang around as long as the client does. Similarly, if process_open1() removes an existing open owner from the close lru, anticipating that an open owner that previously had no associated stateid's now will, but the open subsequently fails, then we'll again be left with the same leak. Fix both problems. Reported-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + fs/nfsd/nfs4state.c | 20 ++++++++++++++++++-- fs/nfsd/state.h | 1 + fs/nfsd/xdr4.h | 1 + 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5b192a2512b6..10b50d78bdc3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -409,6 +409,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: + nfsd4_cleanup_open_state(open, status); if (open->op_openowner) cstate->replay_owner = &open->op_openowner->oo_owner; else diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 62aa91ae278b..2c9a1a20e014 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2320,7 +2320,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return NULL; oo->oo_owner.so_is_open_owner = 1; oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_flags = 0; + oo->oo_flags = NFS4_OO_NEW; oo->oo_time = 0; oo->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&oo->oo_close_lru); @@ -2526,7 +2526,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_openowner = NULL; goto new_owner; } - list_del_init(&oo->oo_close_lru); return nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); new_owner: oo = alloc_init_open_stateowner(strhashval, clp, open); @@ -2946,6 +2945,23 @@ out: return status; } +void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +{ + if (open->op_openowner) { + struct nfs4_openowner *oo = open->op_openowner; + + if (!list_empty(&oo->oo_owner.so_stateids)) + list_del_init(&oo->oo_close_lru); + if (oo->oo_flags & NFS4_OO_NEW) { + if (status) { + release_openowner(oo); + open->op_openowner = NULL; + } else + oo->oo_flags &= ~NFS4_OO_NEW; + } + } +} + __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 87eecfd9b968..eab9dae23c06 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -359,6 +359,7 @@ struct nfs4_openowner { time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 #define NFS4_OO_PURGE_CLOSE 2 +#define NFS4_OO_NEW 4 unsigned char oo_flags; }; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4c8a7ec3f25d..32e6fd8d9768 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -554,6 +554,7 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, -- cgit v1.2.3-58-ga151 From 32513b40efdc693b3675f1c691ab901518fbcb6a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2011 16:00:16 -0400 Subject: nfsd4: preallocate nfs4_file in process_open1() Creating a new file is an irrevocable step--once it's visible in the filesystem, other processes may have seen it and done something with it, and unlinking it wouldn't simply undo the effects of the create. Therefore, in the case where OPEN creates a new file, we shouldn't do the create until we know that the rest of the OPEN processing will succeed. 
For example, we should preallocate a struct file in case we need it, rather than waiting to allocate it till process_open2(), which is already too late. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 60 ++++++++++++++++++++++++++++++++--------------------- fs/nfsd/xdr4.h | 1 + 2 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2c9a1a20e014..ae5d25075f67 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -104,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes) static struct list_head del_recall_lru; +static void nfsd4_free_file(struct nfs4_file *f) +{ + kmem_cache_free(file_slab, f); +} + static inline void put_nfs4_file(struct nfs4_file *fi) { @@ -111,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi) list_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); - kmem_cache_free(file_slab, fi); + nfsd4_free_file(fi); } } @@ -2190,30 +2195,28 @@ out: return status; } +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + /* OPEN Share state helper functions */ -static inline struct nfs4_file * -alloc_init_file(struct inode *ino) +static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) { - struct nfs4_file *fp; unsigned int hashval = file_hashval(ino); - fp = kmem_cache_alloc(file_slab, GFP_KERNEL); - if (fp) { - atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); - INIT_LIST_HEAD(&fp->fi_stateids); - INIT_LIST_HEAD(&fp->fi_delegations); - fp->fi_inode = igrab(ino); - fp->fi_had_conflict = false; - fp->fi_lease = NULL; - memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); - memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); - return fp; - } - return NULL; + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); } static void @@ -2509,6 +2512,13 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); oo = find_openstateowner_str(strhashval, open); @@ -2884,9 +2894,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) goto out; status = nfserr_jukebox; - fp = alloc_init_file(ino); - if (fp == NULL) - goto out; + fp = open->op_file; + open->op_file = NULL; + nfsd4_init_file(fp, ino); } /* @@ -2960,6 +2970,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) oo->oo_flags &= ~NFS4_OO_NEW; } } + if (open->op_file) + nfsd4_free_file(open->op_file); } __be32 diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 32e6fd8d9768..502dd43634f9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -228,6 +228,7 @@ struct nfsd4_open { u32 op_rflags; /* response */ int op_truncate; /* used during processing */ struct
nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr -- cgit v1.2.3-58-ga151 From 996e09385c364f97a89648b401409521e2a3a094 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Oct 2011 11:14:48 -0400 Subject: nfsd4: do idr preallocation with stateid allocation Move idr preallocation out of stateid initialization, into stateid allocation, so that we no longer have to handle any errors from the former. This is a little subtle due to the way the idr code manages these preallocated items--document that in comments. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 76 ++++++++++++++++++++++++++--------------------------- fs/nfsd/state.h | 4 +-- 2 files changed, 39 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ae5d25075f67..1f8c781c2a28 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -215,13 +215,12 @@ static inline int get_new_stid(struct nfs4_stid *stid) int new_stid; int error; - if (!idr_pre_get(stateids, GFP_KERNEL)) - return -ENOMEM; - error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); /* - * All this code is currently serialized; the preallocation - * above should still be ours: + * Note: the necessary preallocation was done in + * nfs4_alloc_stateid(). The idr code caps the number of + * preallocations that can exist at a time, but the state lock + * prevents anyone from using ours before we get here: */ BUG_ON(error); /* @@ -240,7 +239,7 @@ static inline int get_new_stid(struct nfs4_stid *stid) return new_stid; } -static inline __be32 init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) { stateid_t *s = &stid->sc_stateid; int new_id; @@ -249,12 +248,24 @@ static inline __be32 init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, u stid->sc_client = cl; s->si_opaque.so_clid = cl->cl_clientid; new_id = get_new_stid(stid); - if (new_id < 0) - return nfserr_jukebox; s->si_opaque.so_id = (u32)new_id; /* Will be incremented before return to client: */ s->si_generation = 0; - return 0; +} + +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) +{ + struct idr *stateids = &cl->cl_stateids; + + if (!idr_pre_get(stateids, GFP_KERNEL)) + return NULL; + /* + * Note: if we fail here (or any time between now and the time + * we actually get the new idr), we won't need to undo the idr + * preallocation, since the idr code caps the number of + * preallocated entries. + */ + return kmem_cache_alloc(slab, GFP_KERNEL); } static struct nfs4_delegation * @@ -262,7 +273,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; - __be32 status; dprintk("NFSD alloc_init_deleg\n"); /* @@ -276,14 +286,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv return NULL; if (num_delegations > max_delegations) return NULL; - dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - status = init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); - if (status) { - kmem_cache_free(deleg_slab, dp); - return NULL; - } + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); /* * delegation seqid's are never incremented. 
The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -2331,14 +2337,11 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return oo; } -static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_client *clp = oo->oo_owner.so_client; - __be32 status; - status = init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); - if (status) - return status; + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -2350,7 +2353,6 @@ static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_ __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; - return nfs_ok; } static void @@ -2614,10 +2616,14 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st return nfs_ok; } -static inline struct nfs4_ol_stateid * -nfs4_alloc_stateid(void) +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) { - return kmem_cache_alloc(stateid_slab, GFP_KERNEL); + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + +static void nfs4_free_stateid(struct nfs4_ol_stateid *s) +{ + kmem_cache_free(stateid_slab, s); } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -2661,15 +2667,16 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_ol_stateid **stpp, struct nfsd4_open *open) { struct nfs4_ol_stateid *stp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; __be32 status; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(cl); if (stp == NULL) return nfserr_jukebox; status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) { - kmem_cache_free(stateid_slab, stp); + nfs4_free_stateid(stp); return status; } *stpp = stp; @@ -2912,11 +2919,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; - status = init_open_stateid(stp, fp, open); - if (status) { - release_open_stateid(stp); - goto out; - } + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); @@ -3812,16 +3815,11 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = lo->lo_owner.so_client; - __be32 status; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(clp); if (stp == NULL) return NULL; - status = init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); - if (status) { - free_generic_stateid(stp); - return NULL; - } + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index eab9dae23c06..1a5820066040 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -85,6 +85,7 @@ struct nfs4_stid { }; struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ @@ -93,7 +94,6 @@ struct 
nfs4_delegation { u32 dl_type; time_t dl_time; /* For recall: */ - struct nfs4_stid dl_stid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -434,7 +434,7 @@ static inline struct file *find_any_file(struct nfs4_file *f) /* "ol" stands for "Open or Lock". Better suggestions welcome. */ struct nfs4_ol_stateid { - struct nfs4_stid st_stid; + struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; -- cgit v1.2.3-58-ga151 From 4cdc951b8611de4ce25e35c9fb8c0656150c9245 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Oct 2011 15:57:47 -0400 Subject: nfsd4: preallocate open stateid in process_open1() As with the nfs4_file, we'd prefer to find out about any failure before creating a new file rather than after. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 49 ++++++++++++++++++++----------------------------- fs/nfsd/xdr4.h | 1 + 2 files changed, 21 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1f8c781c2a28..3e1d4e08dfad 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -268,6 +268,11 @@ static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cac return kmem_cache_alloc(slab, GFP_KERNEL); } +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) +{ + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -2511,6 +2516,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; + __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; @@ -2538,12 +2544,20 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_openowner = NULL; goto new_owner; } - return nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + clp = oo->oo_owner.so_client; + goto alloc_stateid; new_owner: oo = alloc_init_open_stateowner(strhashval, clp, open); if (oo == NULL) return nfserr_jukebox; open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; return nfs_ok; } @@ -2616,11 +2630,6 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st return nfs_ok; } -static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) -{ - return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); -} - static void nfs4_free_stateid(struct nfs4_ol_stateid *s) { kmem_cache_free(stateid_slab, s); @@ -2661,28 +2670,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, return nfs_ok; } -static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_ol_stateid **stpp, - struct nfs4_file *fp, struct svc_fh *cur_fh, - struct nfsd4_open *open) -{ - struct nfs4_ol_stateid *stp; - struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; - __be32 status; - - stp = nfs4_alloc_stateid(cl); - if (stp == NULL) - return nfserr_jukebox; - - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); - if (status) { - nfs4_free_stateid(stp); - return status; - } - *stpp = stp; - return 0; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct 
nfsd4_open *open) @@ -2916,9 +2903,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (status) goto out; } else { - status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; + stp = open->op_stp; + open->op_stp = NULL; init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { @@ -2975,6 +2964,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) } if (open->op_file) nfsd4_free_file(open->op_file); + if (open->op_stp) + nfs4_free_stateid(open->op_stp); } __be32 diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 502dd43634f9..ce8c59196b4e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -229,6 +229,7 @@ struct nfsd4_open { int op_truncate; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr -- cgit v1.2.3-58-ga151 From 856121b2e83bd64bffdc8de449d24c9295e92ff3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2011 11:37:11 -0400 Subject: nfsd4: warn on open failure after create If we create the object and then return failure to the client, we're left with an unexpected file in the filesystem. I'm trying to eliminate such cases but not 100% sure I have so an assertion might be helpful for now. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 7 ++++--- fs/nfsd/vfs.c | 2 +- fs/nfsd/vfs.h | 2 +- fs/nfsd/xdr4.h | 3 ++- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 10b50d78bdc3..710b97b7a2f3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -194,7 +194,6 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o { struct svc_fh resfh; __be32 status; - int created = 0; fh_init(&resfh, NFS4_FHSIZE); open->op_truncate = 0; @@ -223,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, - &open->op_truncate, &created); + &open->op_truncate, &open->op_created); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -253,7 +252,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o /* set reply cache */ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); - if (!created) + if (!open->op_created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -318,6 +317,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* We don't yet support WANT bits: */ open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + open->op_created = 0; /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock @@ -408,6 +408,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 
*/ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + WARN_ON(status && open->op_created); out: nfsd4_cleanup_open_state(open, status); if (open->op_openowner) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4dd91283d039..7a2e442623c8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1370,7 +1370,7 @@ __be32 do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, - int *truncp, int *created) + bool *truncp, bool *created) { struct dentry *dentry, *dchild = NULL; struct inode *dirp; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 503f3bf11abd..3f54ad03bb2b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -62,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, - u32 *verifier, int *truncp, int *created); + u32 *verifier, bool *truncp, bool *created); __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ce8c59196b4e..e3057350eea1 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -226,7 +226,8 @@ struct nfsd4_open { u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ - int op_truncate; /* used during processing */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ -- cgit v1.2.3-58-ga151 From 215fc6af739d2dfdcad5496248d0d6302eb8a604 Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Tue, 18 Oct 2011 10:49:51 -0400 Subject: ext4: MMP: kmmpd should use nodename from init_uts_ns.name, not sysname sysname holds "Linux" by default, i.e. what appears when doing a "uname -s"; nodename should be used to print the machine's hostname, i.e. what is returned when doing a "uname -n" or "hostname", and what gethostname(2)/sethostname(2) manipulate, in order to notify the administrator of the node which is contending to mount the filesystem. Acked-by: Andreas Dilger Signed-off-by: Nikitas Angelinas Signed-off-by: Andrew Perepechko Signed-off-by: "Theodore Ts'o" --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9bdef3f537c5..2fca64efd6db 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -109,7 +109,7 @@ static int kmmpd(void *data) mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); bdevname(bh->b_bdev, mmp->mmp_bdevname); - memcpy(mmp->mmp_nodename, init_utsname()->sysname, + memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); while (!kthread_should_stop()) { -- cgit v1.2.3-58-ga151 From bdfc230f33a9dab316dcb6be4696ae59e1562e3c Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Tue, 18 Oct 2011 10:51:51 -0400 Subject: ext4: MMP: fix error message rate-limiting logic in kmmpd Current logic would print an error message only once, and then 'failed_writes' would stay at 1. Rework the loop to increment 'failed_writes' and print the error message every s_mmp_update_interval * 60 seconds, as intended according to the comment. 
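To illustrate the difference, here is a minimal standalone simulation of the two loop shapes over 180 consecutive failed writes; the kmmpd context is abstracted away, and the "printed" counter is a stand-in for the ext4_error() call:

#include <stdio.h>

int main(void)
{
    int failed_writes, i, printed;

    /* Old logic: the counter is only advanced when the message prints,
     * so after the first failure it sticks at 1 and the message never
     * prints again. */
    failed_writes = 0;
    printed = 0;
    for (i = 0; i < 180; i++) {
        if ((failed_writes % 60) == 0) {
            printed++;          /* ext4_error(...) */
            failed_writes++;
        }
    }
    printf("old logic: %d message(s)\n", printed);  /* prints 1 */

    /* New logic: count every failure, print every 60th one. */
    failed_writes = 0;
    printed = 0;
    for (i = 0; i < 180; i++) {
        if ((failed_writes % 60) == 0)
            printed++;          /* ext4_error(...) */
        failed_writes++;
    }
    printf("new logic: %d message(s)\n", printed);  /* prints 3 */
    return 0;
}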
Signed-off-by: Nikitas Angelinas Signed-off-by: Andrew Perepechko Signed-off-by: "Theodore Ts'o" Acked-by: Andreas Dilger --- fs/ext4/mmp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 2fca64efd6db..6b327423e622 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -125,8 +125,9 @@ static int kmmpd(void *data) * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. */ - if (retval && (failed_writes % 60) == 0) { - ext4_error(sb, "Error writing to MMP block"); + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); failed_writes++; } -- cgit v1.2.3-58-ga151 From f6f96fdb8c2779f9bd8ed7b0b08405b4439c982b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 18 Oct 2011 10:53:51 -0400 Subject: ext4: Fix comparison endianness problem in MMP initialization As part of startup, the MMP initialization code does this: mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); Next, mmp->mmp_seq is written out to disk, a delay happens, and then the MMP block is read back in and the sequence value is tested: if (seq != le32_to_cpu(mmp->mmp_seq)) { /* fail the mount */ On a LE system such as x86, the *le32* functions do nothing and this works. Unfortunately, on a BE system such as ppc64, this comparison becomes: if (cpu_to_le32(new_seq) != le32_to_cpu(cpu_to_le32(new_seq)) { /* fail the mount */ Except for a few palindromic sequence numbers, this test always causes the mount to fail, which makes MMP filesystems generally unmountable on ppc64. The attached patch fixes this situation. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 6b327423e622..7ea4ba4eff2a 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -296,7 +296,8 @@ skip: /* * write a new random sequence number. */ - mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); retval = write_mmp_block(bh); if (retval) -- cgit v1.2.3-58-ga151 From 1bce63d1a2a2c8929442b79acd4eab2e3db10a0b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 18 Oct 2011 10:55:51 -0400 Subject: ext4: add block plug for .writepages Add block plug for ext4 .writepages. Though ext4 .writepages already handles request merge very well, block plug is still helpful to reduce block lock contention. 
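For reference, the plugging pattern the patch adds looks like this in the abstract. This is a kernel-context sketch, not a standalone program, and write_many_pages() is a made-up name standing in for ext4_da_writepages():

#include <linux/blkdev.h>

/*
 * Requests submitted between blk_start_plug() and blk_finish_plug()
 * are queued per-task and flushed to the request queue in one batch,
 * so the queue lock is taken once per batch instead of once per bio.
 */
static void write_many_pages(struct address_space *mapping)
{
    struct blk_plug plug;

    blk_start_plug(&plug);
    /* ... submit all dirty pages of the mapping as bios ... */
    blk_finish_plug(&plug);
}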
Signed-off-by: Shaohua Li Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4863238c3754..081bb25a9ad3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2160,6 +2160,7 @@ static int ext4_da_writepages(struct address_space *mapping, struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); pgoff_t done_index = 0; pgoff_t end; + struct blk_plug plug; trace_ext4_da_writepages(inode, wbc); @@ -2238,6 +2239,7 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); + blk_start_plug(&plug); while (!ret && wbc->nr_to_write > 0) { /* @@ -2302,6 +2304,7 @@ retry: */ break; } + blk_finish_plug(&plug); if (!io_done && !cycled) { cycled = 1; index = 0; -- cgit v1.2.3-58-ga151 From e0cbee3e14195ef07b8ab6ff30930fb93d2e510a Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 18 Oct 2011 10:57:51 -0400 Subject: ext4: functions should not be declared extern The function declarations in ext4.h are already marked extern, so it's not necessary to do so in the .c files. This quiets the sparse noise: warning: function 'ext4_flush_completed_IO' with external linkage has definition warning: function 'ext4_init_inode_table' with external linkage has definition Signed-off-by: H Hartley Sweeten Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 036f78f7a1ef..c942924a0645 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. */ -extern int ext4_flush_completed_IO(struct inode *inode) +int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 04219988e75f..e007fecdaedc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1100,7 +1100,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_claim_inode until we are finished. 
*/ -extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); -- cgit v1.2.3-58-ga151 From e6705f7c255d1ffae7cd161d0b657296f4dd62fd Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 18 Oct 2011 10:59:51 -0400 Subject: ext4: add __user decoration to calls of copy_{from,to}_user() This quiets the sparse noise: warning: incorrect type in argument 2 (different address spaces) expected void const [noderef] *from got struct fstrim_range * warning: incorrect type in argument 1 (different address spaces) expected void [noderef] *to got struct fstrim_range * Signed-off-by: H Hartley Sweeten Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8f7ea6905421..a56796814d6a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -350,7 +350,7 @@ mext_out: return -EOPNOTSUPP; } - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -360,7 +360,7 @@ mext_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; -- cgit v1.2.3-58-ga151 From ee90d57e20bd4749dda3b14397392b18b89dc3ef Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 18 Oct 2011 11:01:51 -0400 Subject: ext4: quiet sparse noise about plain integer as NULL pointer The third parameter to ext4_free_blocks is a struct buffer_head *. This parameter should be NULL not 0. This quiets the sparse noise: warning: Using plain integer as NULL pointer Signed-off-by: H Hartley Sweeten Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5c4861210d4c..2fc6cc0a51ca 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2291,7 +2291,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, start = ext4_ext_pblock(ex); ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); + ext4_free_blocks(handle, inode, NULL, start, num, flags); } else { printk(KERN_INFO "strange request: removal(2) " -- cgit v1.2.3-58-ga151 From 7748dd6eab8e13f974d4664395e76afffacda04b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Oct 2011 12:41:35 +0300 Subject: CIFS: cleanup min_t() cast in cifs_read() Smatch complains that the cast to "int" in min_t() changes very large values of current_read_size into negative values and so min_t() could return the wrong value. I removed the const as well, as that doesn't do anything here. 
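To see what Smatch is complaining about, here is a minimal standalone program; min_t() below is a simplified stand-in for the kernel macro, which additionally does type checking:

#include <stdio.h>

#define min_t(type, x, y) ((type)(x) < (type)(y) ? (type)(x) : (type)(y))

int main(void)
{
    size_t bytes_left = 0x90000000UL;   /* larger than INT_MAX */
    unsigned int rsize = 16384;

    /* On common two's-complement ABIs, (int)0x90000000 is a large
     * negative number, so min_t(int, ...) picks it as the "minimum". */
    printf("min_t(int, ...)  = %d\n", min_t(int, bytes_left, rsize));
    /* With an unsigned type the comparison behaves as intended. */
    printf("min_t(uint, ...) = %u\n", min_t(unsigned int, bytes_left, rsize));
    return 0;
}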
Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7f84ece116d0..852d1f39adae 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1904,13 +1904,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(const int, read_size - total_read, + current_read_size = min_t(uint, read_size - total_read, cifs_sb->rsize); /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { - current_read_size = min_t(const int, current_read_size, + current_read_size = min_t(uint, current_read_size, CIFSMaxBufSize); } rc = -EAGAIN; -- cgit v1.2.3-58-ga151 From ad4778fb40994dd7c779069dad6ff704d75b81e6 Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Tue, 18 Oct 2011 10:58:50 +0200 Subject: CIFS: fix automount for DFS shares Automounting directories are now invalidated by .d_revalidate() so as to be d_instantiate()d again with the right DCACHE_NEED_AUTOMOUNT flag. Reviewed-by: Jeff Layton Signed-off-by: Gerlando Falauto Signed-off-by: Steve French --- fs/cifs/dir.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 0c8098d54d2b..d7eeb9d3ed6f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -648,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; - else + else { + /* + * Forcibly invalidate automounting directory inodes + * (remote DFS directories) so to have them + * instantiated again for automount + */ + if (IS_AUTOMOUNT(direntry->d_inode)) + return 0; return 1; + } } /* -- cgit v1.2.3-58-ga151 From 59b7c05fffba030e5d9e72324691e2f99aa69b79 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Oct 2011 18:22:55 -0700 Subject: Revert "NFS: Ensure that writeback_single_inode() calls write_inode() when syncing" This reverts commit b80c3cb628f0ebc241b02e38dd028969fb8026a2. The reverted commit was rendered obsolete by a VFS fix: commit 5547e8aac6f71505d621a612de2fca0dd988b439 (writeback: Update dirty flags in two steps). We no longer need to worry about writeback_single_inode() missing our marking of the inode for COMMIT in the 'do_writepages()' call. Reverting this patch fixes a performance regression in which the inode would continuously get queued to the dirty list, causing the writeback code to unnecessarily try to send a COMMIT.
Signed-off-by: Trond Myklebust Tested-by: Simon Kirby Cc: stable@kernel.org [2.6.35+] --- fs/nfs/write.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c9bd2a6b7d4b..71296cc97f98 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -428,7 +428,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); - __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); -- cgit v1.2.3-58-ga151 From 3236c3e1adc0c7ec83eaff1de2d06746b7c5bb28 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 09:49:21 -0400 Subject: nfs: don't redirty inode when ncommit == 0 in nfs_commit_unstable_pages commit 420e3646 allowed the kernel to reduce the number of unnecessary commit calls by skipping the commit when there are a large number of outstanding pages. However, the current test in nfs_commit_unstable_pages does not handle the edge condition properly. When ncommit == 0, then that means that the kernel doesn't need to do anything more for the inode. The current test though in the WB_SYNC_NONE case will return true, and the inode will end up being marked dirty. Once that happens the inode will never be clean until there's a WB_SYNC_ALL flush. Fix this by immediately returning from nfs_commit_unstable_pages when ncommit == 0. Mike noticed this problem initially in RHEL5 (2.6.18-based kernel) which has a backported version of 420e3646. The inode cache there was growing very large. The inode cache was unable to be shrunk since the inodes were all marked dirty. Calling sync() would essentially "fix" the problem -- the WB_SYNC_ALL flush would result in the inodes all being marked clean. What I'm not clear on is how big a problem this is in mainline kernels as the writeback code there is very different. Either way, it seems incorrect to re-mark the inode dirty in this case. Reported-by: Mike McLean Signed-off-by: Jeff Layton Cc: stable@kernel.org [2.6.34+] Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 71296cc97f98..46aa4389ce13 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1554,6 +1554,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. -- cgit v1.2.3-58-ga151 From b9dd3abbbc708da5e3c53424a5b2c66ab580f97e Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 12 Oct 2011 15:09:34 +0800 Subject: nfs: fix bug about IPv6 address scope checking The result from ipv6_addr_scope() is not always a single scope value, so we can't compare it for equality with IPV6_ADDR_SCOPE_LINKLOCAL in nfs_sockaddr_match_ipaddr6(). This patch fixes the problem, and checks the address before the scope_id.
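For illustration, a standalone userspace analogue of the corrected check ordering: compare the addresses first, and only consult sin6_scope_id when the address is link-local. sockaddr_match_ipaddr6() is a made-up stand-in here; the kernel code uses ipv6_addr_equal() and ipv6_addr_type() where this sketch uses memcmp() and glibc's IN6_IS_ADDR_LINKLOCAL():

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int sockaddr_match_ipaddr6(const struct sockaddr_in6 *a,
                                  const struct sockaddr_in6 *b)
{
    /* Different addresses never match, whatever the scope. */
    if (memcmp(&a->sin6_addr, &b->sin6_addr, sizeof(a->sin6_addr)))
        return 0;
    /* Only link-local addresses need matching scope ids. */
    if (IN6_IS_ADDR_LINKLOCAL(&a->sin6_addr))
        return a->sin6_scope_id == b->sin6_scope_id;
    return 1;
}

int main(void)
{
    struct sockaddr_in6 a = { .sin6_family = AF_INET6, .sin6_scope_id = 2 };
    struct sockaddr_in6 b = { .sin6_family = AF_INET6, .sin6_scope_id = 3 };

    inet_pton(AF_INET6, "fe80::1", &a.sin6_addr);
    inet_pton(AF_INET6, "fe80::1", &b.sin6_addr);
    /* Same link-local address, different scopes: no match. */
    printf("%d\n", sockaddr_match_ipaddr6(&a, &b));     /* 0 */

    inet_pton(AF_INET6, "2001:db8::1", &a.sin6_addr);
    inet_pton(AF_INET6, "2001:db8::1", &b.sin6_addr);
    /* Same global address: scope ids are ignored. */
    printf("%d\n", sockaddr_match_ipaddr6(&a, &b));     /* 1 */
    return 0;
}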
Signed-off-by: Mi Jinlong Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5833fbbf59b0..b4e41dd4d0f6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3-58-ga151 From 2da956523526e440ef4f4dd174e26f5ac06fe011 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 12 Oct 2011 10:57:42 -0400 Subject: nfs: don't try to migrate pages with active requests nfs_find_and_lock_request will take a reference to the nfs_page and will then put it if the req is already locked. It's possible though that the reference will be the last one. That put then can kick off a whole series of reference puts: nfs_page nfs_open_context dentry inode If the inode ends up being deleted, then the VFS will call truncate_inode_pages. That function will try to take the page lock, but it was already locked when migrate_page was called. The code deadlocks. Fix this by simply refusing the migration request if PagePrivate is already set, indicating that the page is already associated with an active read or write request. We've had a customer test a backported version of this patch and the preliminary results seem good. Cc: stable@kernel.org Cc: Andrea Arcangeli Reported-by: Harshula Jayasuriya Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 46aa4389ce13..72813ede029e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1691,34 +1691,20 @@ out_error: int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) { - struct nfs_page *req; - int ret; + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. 
+ */ + if (PagePrivate(page)) + return -EBUSY; nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page, false); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; - - ret = migrate_page(mapping, newpage, page); - if (!req) - goto out; - if (ret) - goto out_unlock; - page_cache_get(newpage); - spin_lock(&mapping->host->i_lock); - req->wb_page = newpage; - SetPagePrivate(newpage); - set_page_private(newpage, (unsigned long)req); - ClearPagePrivate(page); - set_page_private(page, 0); - spin_unlock(&mapping->host->i_lock); - page_cache_release(page); -out_unlock: - nfs_clear_page_tag_locked(req); -out: - return ret; + return migrate_page(mapping, newpage, page); } #endif -- cgit v1.2.3-58-ga151 From 516f2e24faa7548a61d9ba790958528469c2e284 Mon Sep 17 00:00:00 2001 From: Jim Rees Date: Thu, 22 Sep 2011 21:50:08 -0400 Subject: pnfsblock: fix return code confusion Always return PTR_ERR, not NULL, from nfs4_blk_get_deviceinfo and nfs4_blk_decode_device. Check for IS_ERR, not NULL, in bl_set_layoutdriver when calling nfs4_blk_get_deviceinfo. Signed-off-by: Jim Rees Signed-off-by: Benny Halevy Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 20 ++++++++++++-------- fs/nfs/blocklayout/blocklayoutdev.c | 13 ++++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9561c8fc8bdb..d2432f0dc40c 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -805,7 +805,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, struct nfs4_deviceid *d_id) { struct pnfs_device *dev; - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; u32 max_resp_sz; int max_pages; struct page **pages = NULL; @@ -823,18 +823,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dev = kmalloc(sizeof(*dev), GFP_NOFS); if (!dev) { dprintk("%s kmalloc failed\n", __func__); - return NULL; + return ERR_PTR(-ENOMEM); } pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); if (pages == NULL) { kfree(dev); - return NULL; + return ERR_PTR(-ENOMEM); } for (i = 0; i < max_pages; i++) { pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); goto out_free; + } } memcpy(&dev->dev_id, d_id, sizeof(*d_id)); @@ -847,8 +849,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); rc = nfs4_proc_getdeviceinfo(server, dev); dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) + if (rc) { + rv = ERR_PTR(rc); goto out_free; + } rv = nfs4_blk_decode_device(server, dev); out_free: @@ -866,7 +870,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) struct pnfs_devicelist *dlist = NULL; struct pnfs_block_dev *bdev; LIST_HEAD(block_disklist); - int status = 0, i; + int status, i; dprintk("%s enter\n", __func__); @@ -898,8 +902,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) for (i = 0; i < dlist->num_devs; i++) { bdev = nfs4_blk_get_deviceinfo(server, fh, &dlist->dev_id[i]); - if (!bdev) { - status = -ENODEV; + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); goto out_error; } spin_lock(&b_mt_id->bm_lock); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a83b393fb01c..0b1fb0e25b93 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ 
b/fs/nfs/blocklayout/blocklayoutdev.c @@ -131,7 +131,7 @@ struct pnfs_block_dev * nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev) { - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; struct block_device *bd = NULL; struct rpc_pipe_msg msg; struct bl_msg_hdr bl_msg = { @@ -141,7 +141,7 @@ nfs4_blk_decode_device(struct nfs_server *server, uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); struct bl_dev_msg *reply = &bl_mount_reply; - int offset, len, i; + int offset, len, i, rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, @@ -168,8 +168,10 @@ nfs4_blk_decode_device(struct nfs_server *server, dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&bl_wq, &wq); - if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + if (rc < 0) { remove_wait_queue(&bl_wq, &wq); + rv = ERR_PTR(rc); goto out; } @@ -187,8 +189,9 @@ nfs4_blk_decode_device(struct nfs_server *server, bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", - __func__, PTR_ERR(bd)); + rc = PTR_ERR(bd); + dprintk("%s failed to open device : %d\n", __func__, rc); + rv = ERR_PTR(rc); goto out; } -- cgit v1.2.3-58-ga151 From fdc17abbc4b6094b34ee8ff5d91eaba8637594a2 Mon Sep 17 00:00:00 2001 From: Jim Rees Date: Thu, 22 Sep 2011 21:50:09 -0400 Subject: pnfsblock: fix size of upcall message Make the status field explicitly 32 bits. "...it's unlikely that the kernel and userspace would differ on the size of an int here, but it might be a good idea to go ahead and make that explicitly 32 bits in case we end up dealing with more exotic arches at some point in the future." Suggested-by: Jeff Layton Signed-off-by: Jim Rees Signed-off-by: Benny Halevy Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index f27d827960a3..58dc256402e3 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) } struct bl_dev_msg { - int status; + int32_t status; uint32_t major, minor; }; -- cgit v1.2.3-58-ga151 From c1225158a8dad9e9d5eee8a17dbbd9c7cda05ab9 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:10 -0400 Subject: SUNRPC/NFS: make rpc pipe upcall generic The same function is used by idmap, gss and blocklayout code. Make it generic. 
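One detail worth noting: the shared helper copies from msg->copied onward, i.e. min(msg->len - msg->copied, buflen), where the gss and idmap copies it replaces used min(msg->len, buflen). A standalone sketch of that resumable-copy bookkeeping, with copy_to_user() replaced by memcpy(); struct pipe_msg and generic_upcall() are made-up names:

#include <stdio.h>
#include <string.h>

struct pipe_msg {
    const char *data;
    size_t len;
    size_t copied;
};

/* Deliver at most buflen bytes starting at the resume point, and
 * advance the resume point by what was actually delivered. */
static size_t generic_upcall(struct pipe_msg *msg, char *dst, size_t buflen)
{
    size_t mlen = msg->len - msg->copied;

    if (mlen > buflen)
        mlen = buflen;
    memcpy(dst, msg->data + msg->copied, mlen);
    msg->copied += mlen;
    return mlen;
}

int main(void)
{
    struct pipe_msg msg = { "upcall-payload", 14, 0 };
    char buf[6];
    size_t n;

    /* A small reader drains the message across several reads. */
    while ((n = generic_upcall(&msg, buf, sizeof(buf))) > 0)
        printf("read %zu byte(s): %.*s\n", n, (int)n, buf);
    return 0;
}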
Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nfs/blocklayout/blocklayout.h | 2 -- fs/nfs/blocklayout/blocklayoutdev.c | 22 ---------------------- fs/nfs/idmap.c | 25 +------------------------ include/linux/sunrpc/rpc_pipe_fs.h | 2 ++ net/sunrpc/auth_gss/auth_gss.c | 24 ++---------------------- net/sunrpc/rpc_pipe.c | 20 ++++++++++++++++++++ 7 files changed, 26 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d2432f0dc40c..dc23833c0231 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -964,7 +964,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { }; static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = bl_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 58dc256402e3..42acf7ef5992 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq; #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ /* blocklayoutdev.c */ -ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); struct block_device *nfs4_blkdev_get(dev_t dev); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index 0b1fb0e25b93..d08ba9107fde 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev) return blkdev_put(bdev, FMODE_READ); } -/* - * Shouldn't there be a rpc_generic_upcall() to do this for us? 
- */ -ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len - msg->copied, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static struct bl_dev_msg bl_mount_reply; ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index f20801ae0a16..47d1c6ff2d8e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -336,8 +336,6 @@ struct idmap { struct idmap_hashtable idmap_group_hash; }; -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); @@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); static const struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = idmap_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, }; @@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, return ret; } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index cf14db975da0..e4ea43058d8f 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -44,6 +44,8 @@ RPC_I(struct inode *inode) return container_of(inode, struct rpc_inode, vfs_inode); } +extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); struct rpc_clnt; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 364eb45e989d..e9b76939268d 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -603,26 +603,6 @@ out: return err; } -static ssize_t -gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - #define MSG_BUF_MAXSIZE 1024 static ssize_t @@ -1590,7 +1570,7 @@ static const struct rpc_credops gss_nullops = { }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { - .upcall = gss_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, @@ -1598,7 +1578,7 @@ static const struct rpc_pipe_ops gss_upcall_ops_v0 = { }; static const struct rpc_pipe_ops 
gss_upcall_ops_v1 = { - .upcall = gss_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index b181e3441323..67dbc1884383 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -77,6 +77,26 @@ rpc_timeout_upcall_queue(struct work_struct *work) rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT); } +ssize_t rpc_pipe_generic_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + size_t mlen = min(msg->len - msg->copied, buflen); + unsigned long left; + + left = copy_to_user(dst, data, mlen); + if (left == mlen) { + msg->errno = -EFAULT; + return -EFAULT; + } + + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} +EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall); + /** * rpc_queue_upcall - queue an upcall message to userspace * @inode: inode of upcall pipe on which to queue given message -- cgit v1.2.3-58-ga151 From 760383f1ee4d14b0e0bdf0cddee648d9b8633429 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:11 -0400 Subject: pnfsblock: add missing rpc_put_mount and path_put Reviewed-by: Jeff Layton Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index dc23833c0231..dee6cae80fea 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -993,17 +993,20 @@ static int __init nfs4blocklayout_init(void) mnt, NFS_PIPE_DIRNAME, 0, &path); if (ret) - goto out_remove; + goto out_putrpc; bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, &bl_upcall_ops, 0); + path_put(&path); if (IS_ERR(bl_device_pipe)) { ret = PTR_ERR(bl_device_pipe); - goto out_remove; + goto out_putrpc; } out: return ret; +out_putrpc: + rpc_put_mount(); out_remove: pnfs_unregister_layoutdriver(&blocklayout_type); return ret; @@ -1016,6 +1019,7 @@ static void __exit nfs4blocklayout_exit(void) pnfs_unregister_layoutdriver(&blocklayout_type); rpc_unlink(bl_device_pipe); + rpc_put_mount(); } MODULE_ALIAS("nfs-layouttype4-3"); -- cgit v1.2.3-58-ga151 From 1b0ae068779874f54b55aac3a2a992bcf3f2c3c4 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:12 -0400 Subject: pnfs: make _set_lo_fail generic The file layout and block layout both use it to set the layout I/O failure bit. So make it generic.
Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 17 +++-------------- fs/nfs/nfs4filelayout.c | 19 +++---------------- fs/nfs/pnfs.c | 12 ++++++++++++ fs/nfs/pnfs.h | 1 + 4 files changed, 19 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index dee6cae80fea..2167ba2afdbb 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -176,17 +176,6 @@ retry: return bio; } -static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - /* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { @@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err) if (!uptodate) { if (!rdata->pnfs_error) rdata->pnfs_error = -EIO; - bl_set_lo_fail(rdata->lseg); + pnfs_set_lo_fail(rdata->lseg); } bio_put(bio); put_parallel(par); @@ -370,7 +359,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); @@ -386,7 +375,7 @@ static void bl_end_io_write(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index e8915d4840ad..4c78c62639e6 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -/* For data server errors we don't recover from */ -static void -filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, @@ -145,7 +132,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); clp = NFS_SERVER(data->inode)->nfs_client; } @@ -221,7 +208,7 @@ static int filelayout_write_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); clp = NFS_SERVER(data->inode)->nfs_client; } else @@ -256,7 +243,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { prepare_to_resend_writes(data); - 
filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); } else nfs_restart_rpc(task, data->ds_clp); return -EAGAIN; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e550e8836c37..6b19fffa812a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1381,6 +1381,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) } } +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); + void pnfs_set_layoutcommit(struct nfs_write_data *wdata) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 01cbfd54f3cb..94e760eb543a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); -- cgit v1.2.3-58-ga151 From 8ce160c5ef06cc89c2b6b26bfa5ef7a5ce2c93e0 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:14 -0400 Subject: pnfs: recoalesce when ld write pagelist fails For pnfs pagelist write failure, we need to pg_recoalesce and resend IO to mds. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 20 +++++++------------- fs/nfs/pnfs.h | 2 +- fs/nfs/write.c | 25 ++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6b19fffa812a..a205c8e2c731 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1168,23 +1168,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_write_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_write(data, NFS_CLIENT(data->inode), - data->mds_ops, NFS_FILE_SYNC); - return status ? 
: -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94e760eb543a..71c23d40f735 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -201,7 +201,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); -int pnfs_ld_write_done(struct nfs_write_data *); +void pnfs_ld_write_done(struct nfs_write_data *); int pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 72813ede029e..106fd0634ab3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1166,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int status = data->task.tk_status; + int ret, status = data->task.tk_status; + struct nfs_pageio_descriptor pgio; + + if (data->pnfs_error) { + nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); + pgio.pg_recoalesce = 1; + } /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1182,6 +1188,11 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); + if (data->pnfs_error) { + dprintk(", pnfs error = %d\n", data->pnfs_error); + goto next; + } + if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1201,7 +1212,19 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); + if (data->pnfs_error) { + lock_page(page); + nfs_pageio_cond_complete(&pgio, page->index); + ret = nfs_page_async_flush(&pgio, page, 0); + if (ret) { + nfs_set_pageerror(page); + dprintk("rewrite to MDS error = %d\n", ret); + } + unlock_page(page); + } } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } -- cgit v1.2.3-58-ga151 From 9b7eecdcfeb943f130d86bbc249fde4994b6fe30 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:15 -0400 Subject: pnfs: recoalesce when ld read pagelist fails For pnfs pagelist read failure, we need to pg_recoalesce and resend IO to mds. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 20 +++++++------------- fs/nfs/pnfs.h | 2 +- fs/nfs/read.c | 12 +++++++++++- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a205c8e2c731..ee73d9a4f700 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1262,23 +1262,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_read_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_read(data, NFS_CLIENT(data->inode), - data->mds_ops); - return status ? 
: -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 71c23d40f735..1509530cb111 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -202,7 +202,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); -int pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2171c043ab08..bfc20b160243 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -541,13 +541,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) static void nfs_readpage_release_full(void *calldata) { struct nfs_read_data *data = calldata; + struct nfs_pageio_descriptor pgio; + if (data->pnfs_error) { + nfs_pageio_init_read_mds(&pgio, data->inode); + pgio.pg_recoalesce = 1; + } while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_readpage_release(req); + if (!data->pnfs_error) + nfs_readpage_release(req); + else + nfs_pageio_add_request(&pgio, req); } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_readdata_release(calldata); } -- cgit v1.2.3-58-ga151 From e6d05a757c314ad88d0649d3835a8a1daa964236 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:16 -0400 Subject: pnfsblock: fix NULL pointer dereference bl_add_page_to_bio returns error pointer. bio should be reset to NULL in failure cases as the out path always calls bl_submit_bio. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 2167ba2afdbb..4ddbfbf1c3ad 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -292,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) bl_end_io_read, par); if (IS_ERR(bio)) { rdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } } @@ -581,6 +582,7 @@ fill_invalid_ext: bl_end_io_write_zero, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } /* FIXME: This should be done in bi_end_io */ @@ -629,6 +631,7 @@ next_page: bl_end_io_write, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } isect += PAGE_CACHE_SECTORS; -- cgit v1.2.3-58-ga151 From 7542274519b3ba87555410c66e8356ac1e3bc9b3 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:17 -0400 Subject: pnfsblock: fix writeback deadlock We should check if the sector is already initialized before trying to grab the page from page cache. Otherwise, when two pages of the same block are written back by two threads, each calling in from writepage_locked, it can cause a deadlock like the one below. [ 1080.972099] INFO: task kswapd0:25 blocked for more than 120 seconds. [ 1080.972377] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1080.972812] kswapd0 D ffff88000c4926c0 0 25 2 0x00000000 [ 1080.972816] ffff88000df276b0 0000000000000046 ffff88000df27640 ffffffff81013ba7 [ 1080.972821] ffff88000c492310 ffff88000df27fd8 ffff88000df27fd8 00000000001d3440 [ 1080.972824] ffff88000c378000 ffff88000c492310 ffff8800175d3d40 ffff880017fc75a8 [ 1080.972828] Call Trace: [ 1080.972860] [] ? read_tsc+0x9/0x19 [ 1080.972877] [] ? lock_page+0x2b/0x2b [ 1080.972899] [] io_schedule+0x63/0x7e [ 1080.972902] [] sleep_on_page+0xe/0x12 [ 1080.972905] [] __wait_on_bit_lock+0x46/0x8f [ 1080.972916] [] ? lock_release_holdtime.part.7+0x6b/0x72 [ 1080.972919] [] __lock_page+0x66/0x68 [ 1080.972928] [] ? autoremove_wake_function+0x3d/0x3d [ 1080.972932] [] lock_page+0x27/0x2b [ 1080.972934] [] find_lock_page+0x34/0x57 [ 1080.972937] [] find_or_create_page+0x34/0x8a [ 1080.972947] [] bl_write_pagelist+0x205/0x6da [blocklayoutdriver] [ 1080.972951] [] ? bl_free_lseg+0x38/0x38 [blocklayoutdriver] [ 1080.972995] [] ? nfs_write_rpcsetup+0x118/0x123 [nfs] [ 1080.973033] [] pnfs_generic_pg_writepages+0x10b/0x1f4 [nfs] [ 1080.973089] [] nfs_pageio_doio+0x1a/0x43 [nfs] [ 1080.973098] [] nfs_pageio_complete+0x16/0x2d [nfs] [ 1080.973108] [] nfs_writepage_locked+0xa0/0xbf [nfs] [ 1080.973119] [] nfs_writepage+0x16/0x2b [nfs] [ 1080.973122] [] ? clear_page_dirty_for_io+0x87/0x9a [ 1080.973133] [] shrink_page_list+0x39b/0x6c8 [ 1080.973139] [] shrink_inactive_list+0x22c/0x39e [ 1080.973144] [] ? lock_release_holdtime.part.7+0x6b/0x72 [ 1080.973148] [] shrink_zone+0x445/0x588 [ 1080.973152] [] balance_pgdat+0x2c2/0x56b [ 1080.973170] [] ? __bitmap_weight+0x34/0x80 [ 1080.973175] [] kswapd+0x2be/0x2fa [ 1080.973179] [] ? __init_waitqueue_head+0x4b/0x4b [ 1080.973183] [] ? balance_pgdat+0x56b/0x56b [ 1080.973187] [] kthread+0xa8/0xb0 [ 1080.973200] [] kernel_thread_helper+0x4/0x10 [ 1080.973205] [] ? __init_kthread_worker+0x5a/0x5a [ 1080.973210] [] ? gs_change+0x13/0x13 [ 1080.973213] no locks held by kswapd0/25. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4ddbfbf1c3ad..281ae95932c9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -533,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) fill_invalid_ext: dprintk("%s need to zero %d pages\n", __func__, npg_zero); for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } /* page ref released in bl_end_io_write_zero */ index = isect >> PAGE_CACHE_SECTOR_SHIFT; dprintk("%s zero %dth page: index %lu isect %llu\n", @@ -552,8 +557,7 @@ fill_invalid_ext: * PageUptodate: It was read before * sector_initialized: already written out */ - if (PageDirty(page) || PageWriteback(page) || - bl_is_sector_init(be->be_inval, isect)) { + if (PageDirty(page) || PageWriteback(page)) { print_page(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-58-ga151 From 45402c38eec740f52422aafc92937c6a4a8c8c0e Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 2 Sep 2011 14:39:12 -0700 Subject: nfs/super.c: local functions should be static commit ae50c0b5 "pnfs: client stats" added additional information to the output of /proc/self/mountstats. 
The new functions introduced are only used in this file and should be marked static. If CONFIG_NFS_V4_1 is not defined, empty stub functions are used. If CONFIG_NFS_V4 is not defined these stub functions are not used at all. Adding static for the functions results in compile warnings: fs/nfs/super.c:743: warning: 'show_sessions' defined but not used fs/nfs/super.c:756: warning: 'show_pnfs' defined but not used Fix this by adding a #ifdef CONFIG_NFS_V4 guard around the two show_ functions. Signed-off-by: H Hartley Sweeten Cc: Trond Myklebust Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5b19b6aabe18..480b3b6bf71e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } + +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_sessions(struct seq_file *m, struct nfs_server *server) +static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) seq_printf(m, ",sessions"); } #else -void show_sessions(struct seq_file *m, struct nfs_server *server) {} +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif #endif +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_pnfs(struct seq_file *m, struct nfs_server *server) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) { seq_printf(m, ",pnfs="); if (server->pnfs_curr_ld) @@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server) else seq_printf(m, "not configured"); } -#else /* CONFIG_NFS_V4_1 */ -void show_pnfs(struct seq_file *m, struct nfs_server *server) {} -#endif /* CONFIG_NFS_V4_1 */ +#else +static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) { -- cgit v1.2.3-58-ga151 From a9a4a87a5942e9271523197a90aaa82349c818fb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Oct 2011 16:08:46 -0700 Subject: NFS: Use the inode->i_version to cache NFSv4 change attribute information Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- fs/nfs/fscache-index.c | 4 ++-- fs/nfs/inode.c | 16 ++++++++-------- fs/nfs/nfs4proc.c | 4 ++-- fs/nfs/write.c | 2 +- include/linux/nfs_fs.h | 1 - 6 files changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 321a66bc3846..7f2654069806 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; - delegation->change_attr = nfsi->change_attr; + delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; delegation->flags = 1<vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (bufmax > sizeof(auxdata)) bufmax = sizeof(auxdata); @@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + 
auxdata.change_attr = nfsi->vfs_inode.i_version; if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fe1203797b2b..4dc6d078f108 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -318,7 +318,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; + inode->i_version = 0; inode->i_size = 0; inode->i_nlink = 0; inode->i_uid = -2; @@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA; @@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; ret |= NFS_INO_INVALID_ATTR; @@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - nfsi->change_attr != fattr->change_attr) + inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ @@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->pre_change_attr = inode->i_version; fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { + if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) invalid |= save_cache_validity; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4700fae1ada0..0f0b6076de62 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -753,9 +753,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); - nfsi->change_attr = cinfo->after; + dir->i_version = cinfo->after; spin_unlock(&dir->i_lock); } diff 
--git a/fs/nfs/write.c b/fs/nfs/write.c index 106fd0634ab3..2084a6494218 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -390,7 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; + inode->i_version++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index eaac770f886e..60a137b7f171 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -149,7 +149,6 @@ struct nfs_inode { unsigned long read_cache_jiffies; unsigned long attrtimeo; unsigned long attrtimeo_timestamp; - __u64 change_attr; /* v4 only */ unsigned long attr_gencount; /* "Generation counter" for the attribute cache. This is -- cgit v1.2.3-58-ga151 From 2900b33999e2fc8a8edf0dddaafffec4da25ee10 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 18 Oct 2011 20:00:14 +0000 Subject: xfs: put in missed fix for merge problem I intended to do this as part of fixing part of the conflict with the merge with Linus' tree, but evidently it didn't get included in the commit. Signed-off-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans_ail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 512ff646d01c..4e3f9bbe0141 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -434,7 +434,7 @@ xfsaild_push( if (!IOP_PUSHBUF(lip)) { stuck++; - flush_log = 1; + ailp->xa_log_flush++; } else { ailp->xa_last_pushed_lsn = lsn; } -- cgit v1.2.3-58-ga151 From 9e4c109ac822395e0aae650e4e3c9e4903f6602f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Oct 2011 15:14:11 +0000 Subject: xfs: add AIL pushing tracepoints Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_trace.h | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans_ail.c | 8 ++++++++ 2 files changed, 45 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b78f2c65438b..f1d2802b2f07 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -30,6 +30,7 @@ struct xfs_buf_log_item; struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; +struct xfs_log_item; struct xlog_ticket; struct log; struct xlog_recover; @@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); 
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); + + DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), TP_ARGS(ip, count, offset, flags), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 4e3f9bbe0141..ed9252bcdac9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" +#include "xfs_trace.h" #include "xfs_error.h" #ifdef DEBUG @@ -425,14 +426,18 @@ xfsaild_push( switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); + trace_xfs_ail_pushbuf(lip); if (!IOP_PUSHBUF(lip)) { + trace_xfs_ail_pushbuf_pinned(lip); stuck++; ailp->xa_log_flush++; } else { @@ -443,12 +448,15 @@ xfsaild_push( case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + stuck++; ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); + trace_xfs_ail_locked(lip); stuck++; break; -- cgit v1.2.3-58-ga151 From 0c2e53f11a6dae9e3af5f50f5ad0382e7c3e0cfa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2011 16:11:22 -0700 Subject: NFS: Remove the unused "lookupfh()" version of nfs4_proc_lookup() ...and also remove the associated nfs_v4_clientops entry. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 62 ++++++++++++++----------------------------------- include/linux/nfs_xdr.h | 3 --- 2 files changed, 17 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f0b6076de62..b0c01b23422d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -2408,14 +2405,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, - const struct nfs_fh *dirfh, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { + struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, - .dir_fh = dirfh, + .dir_fh = NFS_FH(dir), .name = name, }; struct nfs4_lookup_res res = { @@ -2431,40 +2429,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, nfs_fattr_init(fattr); - dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); - dprintk("NFS 
reply lookupfh: %d\n", status); - return status; -} - -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); - /* FIXME: !!!! */ - if (err == -NFS4ERR_MOVED) { - err = -EREMOTE; - break; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; -} - -static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - int status; - dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); - if (status == -NFS4ERR_MOVED) - status = nfs4_get_referral(dir, name, fattr, fhandle); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -2485,11 +2451,18 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), - &exception); - if (err == -EPERM) + int status; + + status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); + switch (status) { + case -NFS4ERR_MOVED: + err = nfs4_get_referral(dir, name, fattr, fhandle); + break; + case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); + } + err = nfs4_handle_exception(NFS_SERVER(dir), + status, &exception); } while (exception.retry); return err; } @@ -6270,7 +6243,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, - .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index abd615d74a29..1e31e1c8655b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1197,9 +1197,6 @@ struct nfs_rpc_ops { int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); - int (*lookupfh)(struct nfs_server *, struct nfs_fh *, - struct qstr *, struct nfs_fh *, - struct nfs_fattr *); int (*getattr) (struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); int (*setattr) (struct dentry *, struct nfs_fattr *, -- cgit v1.2.3-58-ga151 From 08ef7bd3bc04261d14d570ac7eaac3eac947b1ba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2011 16:11:49 -0700 Subject: NFSv4: Translate NFS4ERR_BADNAME into ENOENT when applied to a lookup Both LOOKUP and OPEN operations may return NFS4ERR_BADNAME if we send an invalid name as a filename argument. As far as the application is concerned, it just has to know that the file doesn't exist, and so ENOENT would be the appropriate reply. We should only return EINVAL if the filename is being used to _create_ a new object on the remote filesystem.
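To make the rule concrete, the mapping amounts to something like the following sketch (illustrative only: nfs4_map_bad_name is a hypothetical helper, not part of the patch, and the create case is shown returning -EINVAL to match the changelog while the patch itself simply passes the NFS error up):

	static int nfs4_map_bad_name(int status, int open_flags)
	{
		if (status != -NFS4ERR_BADNAME)
			return status;
		/* a plain lookup of a bad name just means "no such file" */
		if (!(open_flags & O_CREAT))
			return -ENOENT;
		/* only creation of a new object should complain about the name */
		return -EINVAL;
	}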
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b0c01b23422d..ba0da50865fe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1593,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) int status; status = nfs4_run_open_task(data, 0); - if (status != 0 || !data->rpc_done) + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; return status; + } if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -2455,6 +2461,8 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); switch (status) { + case -NFS4ERR_BADNAME: + return -ENOENT; case -NFS4ERR_MOVED: err = nfs4_get_referral(dir, name, fattr, fhandle); break; -- cgit v1.2.3-58-ga151 From 523e1d399ce0e23bec562abe2b2f8d297af81161 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:07 +0200 Subject: block: make gendisk hold a reference to its queue The following command sequence triggers an oops. # mount /dev/sdb1 /mnt # echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete # umount /mnt general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs RIP: 0010:[] [] __lock_acquire+0x389/0x1d60 ... Call Trace: [] lock_acquire+0x95/0x140 [] _raw_spin_lock+0x3b/0x50 [] bdi_lock_two+0x5c/0x70 [] bdev_inode_switch_bdi+0x4c/0xf0 [] __blkdev_put+0x11b/0x1d0 [] __blkdev_put+0x160/0x1d0 [] blkdev_put+0x5f/0x190 [] kill_block_super+0x4d/0x80 [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xed/0x130 [] sys_umount+0x7e/0x3a0 [] system_call_fastpath+0x16/0x1b This is because bdev holds on to disk but disk doesn't pin the associated queue. If a SCSI device is removed while the device is still open, the sdev puts the base reference to the queue on release. When the bdev is finally released, the associated queue is already gone along with the bdi and bdev_inode_switch_bdi() ends up dereferencing already freed bdi. Even if it were not for this bug, disk not holding onto the associated queue is very unusual and error-prone. Fix it by making add_disk() take an extra reference to its queue and put it on disk_release() and ensuring that disk and its fops owner are put in that order after all accesses to the disk and queue are complete. Signed-off-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ fs/block_dev.c | 13 ++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/block/genhd.c b/block/genhd.c index e2f67902dd02..d261b73b9744 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. 
+ */ + WARN_ON_ONCE(blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 95f786ec7f08..1c44b8d54504 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); goto restart; } } @@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) @@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); out: bdput(bdev); @@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); -- cgit v1.2.3-58-ga151 From 92abc475d8de1c29373f6d96ed63d8ecaa199d25 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sun, 16 Oct 2011 18:15:16 -0700 Subject: jffs2: implement mount option parsing and compression overriding Currently jffs2 has compile-time constants (and .config options) controlling whether or not the various compression/decompression drivers are built in and enabled. This is fine for embedded systems, but it clashes with distribution kernels. Distro kernels tend to turn on everything; this causes OpenFirmware to fall over, as it only understands ZLIB-compressed inodes. Booting a kernel that has LZO compression enabled, writing to the boot partition, and then rebooting causes OFW to fail to read the kernel from the filesystem. This is because LZO compression has priority when writing new data to jffs2, if LZO is enabled. This patch adds mount option parsing, and a single supported option ("compr=none"). This adds the flexibility of being able to specify compressor overrides on a per-superblock basis. For now, we can simply disable compression; additional flexibility coming soon. v2: kill some printks, and implement show_options as suggested by Artem Bityutskiy.
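For example, with this applied the override can be requested at mount time (the device and mount point here are illustrative):

	# mount -t jffs2 -o compr=none /dev/mtdblock2 /mnt

and since show_options is implemented, the override remains visible afterwards through /proc/mounts.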
Signed-off-by: Andres Salomon Signed-off-by: Artem Bityutskiy --- fs/jffs2/compr.c | 9 +++-- fs/jffs2/fs.c | 2 +- fs/jffs2/jffs2_fs_sb.h | 6 ++++ fs/jffs2/os-linux.h | 2 +- fs/jffs2/super.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index de4247021d25..97bc74db2c96 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -76,13 +76,18 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t *datalen, uint32_t *cdatalen) { int ret = JFFS2_COMPR_NONE; - int compr_ret; + int mode, compr_ret; struct jffs2_compressor *this, *best=NULL; unsigned char *output_buf = NULL, *tmp_buf; uint32_t orig_slen, orig_dlen; uint32_t best_slen=0, best_dlen=0; - switch (jffs2_compression_mode) { + if (c->mount_opts.override_compr) + mode = c->mount_opts.compr; + else + mode = jffs2_compression_mode; + + switch (mode) { case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bbcb9755dd2b..5d54b4ed1b6c 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) jffs2_do_setattr(inode, &iattr); } -int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 0bc6a6c80a56..55a0c1dceadf 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -29,6 +29,11 @@ struct jffs2_inodirty; +struct jffs2_mount_opts { + bool override_compr; + unsigned int compr; +}; + /* A struct for the overall file system control. Pointers to jffs2_sb_info structs are named `c' in the source code. 
Nee jffs_control @@ -126,6 +131,7 @@ struct jffs2_sb_info { #endif struct jffs2_summary *summary; /* Summary information */ + struct jffs2_mount_opts mount_opts; #ifdef CONFIG_JFFS2_FS_XATTR #define XATTRINDEX_HASHSIZE (57) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 6c1755c59c0f..ab65ee3ec858 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags); struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_remount_fs(struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 853b8e300084..40f6e6385fd1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include "compr.h" #include "nodelist.h" @@ -75,6 +77,29 @@ static void jffs2_write_super(struct super_block *sb) unlock_super(sb); } +static const char *jffs2_compr_name(unsigned int compr) +{ + switch (compr) { + case JFFS2_COMPR_MODE_NONE: + return "none"; + default: + /* should never happen; programmer error */ + WARN_ON(1); + return ""; + } +} + +static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb); + struct jffs2_mount_opts *opts = &c->mount_opts; + + if (opts->override_compr) + seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); + + return 0; +} + static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -133,6 +158,71 @@ static const struct export_operations jffs2_export_ops = { .fh_to_parent = jffs2_fh_to_parent, }; +/* + * JFFS2 mount options. 
+ * + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + + if (!data) + return 0; + + while ((p = strsep(&data, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_override_compr: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) { + c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; + c->mount_opts.override_compr = true; + } + kfree(name); + break; + default: + printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", + p); + return -EINVAL; + } + } + + return 0; +} + +static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + int err; + + err = jffs2_parse_options(c, data); + if (err) + return -EINVAL; + + return jffs2_do_remount_fs(sb, flags, data); +} + static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, @@ -143,6 +233,7 @@ static const struct super_operations jffs2_super_operations = .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, .dirty_inode = jffs2_dirty_inode, + .show_options = jffs2_show_options, .sync_fs = jffs2_sync_fs, }; @@ -166,6 +257,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) c->os_priv = sb; sb->s_fs_info = c; + ret = jffs2_parse_options(c, data); + if (ret) { + kfree(c); + return -EINVAL; + } + /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ mutex_init(&c->alloc_sem); -- cgit v1.2.3-58-ga151 From 123005f3ccfa58637ad6e1a8b9f7f3f861ca65f4 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sun, 16 Oct 2011 18:15:23 -0700 Subject: jffs2: add compr=lzo and compr=zlib options ..to allow forcing of either compression scheme. This will override compiled-in defaults. jffs2_compress is reworked a bit, as the lzo/zlib override shares lots of code w/ the PRIORITY mode. v2: update show_options accordingly. Signed-off-by: Andres Salomon Signed-off-by: Artem Bityutskiy --- fs/jffs2/compr.c | 119 +++++++++++++++++++++++++++++++++++++++---------------- fs/jffs2/compr.h | 2 + fs/jffs2/super.c | 26 +++++++++++- 3 files changed, 110 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 97bc74db2c96..5b6c9d1a2fb9 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, return 0; } +/* + * jffs2_selected_compress: + * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB). + * If 0, just take the first available compression mode. + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: the compression type used. 
Zero is used to show that the data + * could not be compressed; probably because we couldn't find the requested + * compression mode. + */ +static int jffs2_selected_compress(u8 compr, unsigned char *data_in, + unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) +{ + struct jffs2_compressor *this; + int err, ret = JFFS2_COMPR_NONE; + uint32_t orig_slen, orig_dlen; + char *output_buf; + + output_buf = kmalloc(*cdatalen, GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + return ret; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only and disabled modules */ + if (!this->compress || this->disabled) + continue; + + /* Skip if not the desired compression type */ + if (compr && (compr != this->compr)) + continue; + + /* + * Either compression type was unspecified, or we found our + * compressor; either way, we're good to go. + */ + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + + *datalen = orig_slen; + *cdatalen = orig_dlen; + err = this->compress(data_in, output_buf, datalen, cdatalen); + + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!err) { + /* Success */ + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + else + *cpage_out = output_buf; + + return ret; +} + /* jffs2_compress: * @data_in: Pointer to uncompressed data * @cpage_out: Pointer to returned pointer to buffer for compressed data @@ -91,37 +163,8 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: - output_buf = kmalloc(*cdatalen,GFP_KERNEL); - if (!output_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. 
Compression failed.\n"); - goto out; - } - orig_slen = *datalen; - orig_dlen = *cdatalen; - spin_lock(&jffs2_compressor_list_lock); - list_for_each_entry(this, &jffs2_compressor_list, list) { - /* Skip decompress-only backwards-compatibility and disabled modules */ - if ((!this->compress)||(this->disabled)) - continue; - - this->usecount++; - spin_unlock(&jffs2_compressor_list_lock); - *datalen = orig_slen; - *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); - spin_lock(&jffs2_compressor_list_lock); - this->usecount--; - if (!compr_ret) { - ret = this->compr; - this->stat_compr_blocks++; - this->stat_compr_orig_size += *datalen; - this->stat_compr_new_size += *cdatalen; - break; - } - } - spin_unlock(&jffs2_compressor_list_lock); - if (ret == JFFS2_COMPR_NONE) - kfree(output_buf); + ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, + cdatalen); break; case JFFS2_COMPR_MODE_SIZE: case JFFS2_COMPR_MODE_FAVOURLZO: @@ -179,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, best->stat_compr_orig_size += best_slen; best->stat_compr_new_size += best_dlen; ret = best->compr; + *cpage_out = output_buf; } spin_unlock(&jffs2_compressor_list_lock); break; + case JFFS2_COMPR_MODE_FORCELZO: + ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, + cpage_out, datalen, cdatalen); + break; + case JFFS2_COMPR_MODE_FORCEZLIB: + ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, + cpage_out, datalen, cdatalen); + break; default: printk(KERN_ERR "JFFS2: unknown compression mode.\n"); } - out: + if (ret == JFFS2_COMPR_NONE) { *cpage_out = data_in; *datalen = *cdatalen; none_stat_compr_blocks++; none_stat_compr_size += *datalen; } - else { - *cpage_out = output_buf; - } return ret; } diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 13bb7597ab39..5e91d578f4ed 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -40,6 +40,8 @@ #define JFFS2_COMPR_MODE_PRIORITY 1 #define JFFS2_COMPR_MODE_SIZE 2 #define JFFS2_COMPR_MODE_FAVOURLZO 3 +#define JFFS2_COMPR_MODE_FORCELZO 4 +#define JFFS2_COMPR_MODE_FORCEZLIB 5 #define FAVOUR_LZO_PERCENT 80 diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 40f6e6385fd1..e7e974454115 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -82,6 +82,14 @@ static const char *jffs2_compr_name(unsigned int compr) switch (compr) { case JFFS2_COMPR_MODE_NONE: return "none"; +#ifdef CONFIG_JFFS2_LZO + case JFFS2_COMPR_MODE_FORCELZO: + return "lzo"; +#endif +#ifdef CONFIG_JFFS2_ZLIB + case JFFS2_COMPR_MODE_FORCEZLIB: + return "zlib"; +#endif default: /* should never happen; programmer error */ WARN_ON(1); @@ -195,11 +203,25 @@ static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) if (!name) return -ENOMEM; - if (!strcmp(name, "none")) { + if (!strcmp(name, "none")) c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; - c->mount_opts.override_compr = true; +#ifdef CONFIG_JFFS2_LZO + else if (!strcmp(name, "lzo")) + c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; +#endif +#ifdef CONFIG_JFFS2_ZLIB + else if (!strcmp(name, "zlib")) + c->mount_opts.compr = + JFFS2_COMPR_MODE_FORCEZLIB; +#endif + else { + printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"", + name); + kfree(name); + return -EINVAL; } kfree(name); + c->mount_opts.override_compr = true; break; default: printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", -- cgit v1.2.3-58-ga151 From a8d86cd75b709a9c9402c46674ea188493c53901 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Wed, 19 Oct 2011 11:42:03 -0400 Subject: nfsd4: remove unneeded CLAIM_DELEGATE_CUR workaround 0c12eaffdf09466f36a9ffe970dda8f4aeb6efc0 "nfsd: don't break lease on CLAIM_DELEGATE_CUR" was a temporary workaround for a problem fixed properly in the vfs layer by 778fc546f749c588aa2f6cd50215d2715c374252 "locks: fix tracking of inprogress lease breaks", so we can revert that change (but keeping some minor cleanup from that commit). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e1d4e08dfad..15e0db140403 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2653,12 +2653,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); - /* CLAIM_DELEGATE_CUR is used in response to a broken lease; - * allowing it to break the lease and return EAGAIN leaves the - * client unable to make progress in returning the delegation */ - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - access |= NFSD_MAY_NOT_BREAK_LEASE; - if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); -- cgit v1.2.3-58-ga151 From 8b289b2c2355c3bea75f3e499b4aa251a3191382 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 19 Oct 2011 11:52:12 -0400 Subject: nfsd4: implement new 4.1 open reclaim types Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +++------------ fs/nfsd/nfs4state.c | 10 ++++++++-- fs/nfsd/nfs4xdr.c | 13 +++++++++++++ include/linux/nfs4.h | 5 ++++- 4 files changed, 28 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 710b97b7a2f3..458ebb6b59c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -366,12 +366,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - /* - * (1) set CURRENT_FH to the file being opened, - * creating it if necessary, (2) set open->op_cinfo, - * (3) set open->op_truncate if the file is to be - * truncated after opening, (4) do permission checking. - */ status = do_open_lookup(rqstp, &cstate->current_fh, open); if (status) @@ -379,17 +373,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - /* - * The CURRENT_FH is already set to the file being - * opened. (1) set open->op_cinfo, (2) set - * open->op_truncate if the file is to be truncated - * after opening, (3) do permission checking. 
- */ + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, open); if (status) goto out; break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 15e0db140403..e8c2a3ec0e60 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2587,6 +2587,12 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei return delegstateid(ret); } +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; +} + static __be32 nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) @@ -2602,7 +2608,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open if (status) *dp = NULL; out: - if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; @@ -2879,7 +2885,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } else { status = nfserr_bad_stateid; - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (nfsd4_is_deleg_cur(open)) goto out; status = nfserr_jukebox; fp = open->op_file; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 645a0a9d8073..fdc09a52cd8d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -803,6 +803,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) return status; break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; default: goto xdr_error; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b875b0324fc0..32345c2805c0 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -410,7 +410,10 @@ enum open_claim_type4 { NFS4_OPEN_CLAIM_NULL = 0, NFS4_OPEN_CLAIM_PREVIOUS = 1, NFS4_OPEN_CLAIM_DELEGATE_CUR = 2, - NFS4_OPEN_CLAIM_DELEGATE_PREV = 3 + NFS4_OPEN_CLAIM_DELEGATE_PREV = 3, + NFS4_OPEN_CLAIM_FH = 4, /* 4.1 */ + NFS4_OPEN_CLAIM_DELEG_CUR_FH = 5, /* 4.1 */ + NFS4_OPEN_CLAIM_DELEG_PREV_FH = 6, /* 4.1 */ }; enum opentype4 { -- cgit v1.2.3-58-ga151 From f1bdcc0a8278aa42cb77331275890aac85a4e7cd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jul 2011 14:28:08 -0400 Subject: Btrfs: move stuff around in btrfs_inode to get better packing Moving things around to give us better packing in the btrfs_inode. This reduces the size of our inode by 8 bytes. 
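The saving is plain C structure padding at work; a minimal illustration on a 64-bit kernel using the kernel u64/u32 types (not the actual btrfs_inode layout, just the rule that lone 32-bit members between 64-bit ones create holes):

	struct bad  { u64 a; u32 f; u64 b; u32 g; };	/* sizeof == 32: 4-byte hole after f */
	struct good { u64 a; u64 b; u32 f; u32 g; };	/* sizeof == 24: the u32s pack together */

pahole(1) makes such holes easy to spot.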
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9f99a16edd6..bf325f40cf92 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -115,9 +115,6 @@ struct btrfs_inode { */ u64 disk_i_size; - /* flags field from the on disk inode */ - u32 flags; - /* * if this is a directory then index_cnt is the counter for the index * number for new files that are created @@ -131,6 +128,9 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* flags field from the on disk inode */ + u32 flags; + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent -- cgit v1.2.3-58-ga151 From 0cbbdf7c9c46467bfb7129c30236f36a679ab244 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jul 2011 16:02:04 -0400 Subject: Btrfs: kill reserved_bytes in inode reserved_bytes is not used for anything in the inode, remove it. Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 5 ----- fs/btrfs/extent-tree.c | 2 -- fs/btrfs/inode.c | 1 - 3 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bf325f40cf92..c70fb10a307b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -103,11 +103,6 @@ struct btrfs_inode { */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..03edac4f7771 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3122,7 +3122,6 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; @@ -3144,7 +3143,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..156c3a0da792 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6755,7 +6755,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; - ei->reserved_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->index_cnt = (u64)-1; -- cgit v1.2.3-58-ga151 From ba5b8958dabbd7890a6929af1ffc0d87187765dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Jul 2011 15:40:35 -0400 Subject: Btrfs: use d_obtain_alias when mounting subvol/subvolid Currently what we do is just wrong. We either 1) Alloc a new "root" dentry with sb->s_root as its parent, which is just wrong as we could walk into this subvol later on via another path and hilarity could ensue. Also we don't check the return value of d_splice_alias which isn't good either. or 2) Do a d_find_alias(), but we could have lost our dentry from cache at this point and found nothing. So use d_obtain_alias(). In the case that we already have the inode/dentry in cache we will get the correct dentry. If not we will get a disconnected dentry tree so if we walk into it later on everything will be connected up properly.
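The reason a single call can replace both of the old paths is that d_obtain_alias() folds the cached and uncached cases together; roughly, as a sketch of the calling pattern rather than the patch itself:

	/*
	 * If the inode already has a dentry in cache, the existing alias
	 * comes back; otherwise we get a DCACHE_DISCONNECTED dentry that
	 * is reconnected on a later path walk. The inode reference is
	 * consumed in every case, errors included, so no iput() is needed
	 * on any path after this call.
	 */
	return d_obtain_alias(inode);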
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4648d7..244fa46c50b8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -492,7 +492,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -566,29 +565,7 @@ setup_root: return dget(sb->s_root); } - if (new) { - const struct qstr name = { .name = "/", .len = 1 }; - - /* - * New inode, we need to make the dentry a sibling of s_root so - * everything gets cleaned up properly on unmount. - */ - dentry = d_alloc(sb->s_root, &name); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - d_splice_alias(inode, dentry); - } else { - /* - * We found the inode in cache, just find a dentry for it and - * put the reference to the inode we just got. - */ - dentry = d_find_alias(inode); - iput(inode); - } - - return dentry; + return d_obtain_alias(inode); } static int btrfs_fill_super(struct super_block *sb, -- cgit v1.2.3-58-ga151 From 830c4adbd04a79f806d4fa579546f36a71b727c1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Jul 2011 15:55:42 -0400 Subject: Btrfs: fix how we mount subvol= We've only been able to mount with subvol= where whatever was a subvol within whatever root we had as the default. This allows us to mount -o subvol=path/to/subvol/you/want relative from the normal fs_tree root. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 199 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 244fa46c50b8..934789f7fd33 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "compat.h" #include "delayed-inode.h" #include "ctree.h" @@ -58,6 +59,7 @@ #include static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, char nbuf[16]) @@ -411,7 +413,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, int intarg; if (!options) - goto out; + return 0; /* * strsep changes the string, duplicate it because parse_options @@ -460,26 +462,15 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, error = btrfs_scan_one_device(match_strdup(&args[0]), flags, holder, fs_devices); if (error) - goto out_free_opts; + goto out; break; default: break; } } - out_free_opts: +out: kfree(orig); - out: - /* - * If no subvolume name is specified we use the default one. Allocate - * a copy of the string "." here so that code later in the - * mount path doesn't care if it's the default volume or another one. - */ - if (!*subvol_name) { - *subvol_name = kstrdup(".", GFP_KERNEL); - if (!*subvol_name) - return -ENOMEM; - } return error; } @@ -730,6 +721,118 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. 
+ */ +static char *setup_root_args(char *args) +{ + unsigned copied = 0; + unsigned len = strlen(args) + 2; + char *pos; + char *ret; + + /* + * We need the same args as before, but minus + * + * subvol=a + * + * and add + * + * subvolid=0 + * + * which is a difference of 2 characters, so we allocate strlen(args) + + * 2 characters. + */ + ret = kzalloc(len * sizeof(char), GFP_NOFS); + if (!ret) + return NULL; + pos = strstr(args, "subvol="); + + /* This shouldn't happen, but just in case.. */ + if (!pos) { + kfree(ret); + return NULL; + } + + /* + * The subvol=<> arg is not at the front of the string, copy everybody + * up to that into ret. + */ + if (pos != args) { + *pos = '\0'; + strcpy(ret, args); + copied += strlen(args); + pos++; + } + + strncpy(ret + copied, "subvolid=0", len - copied); + + /* Length of subvolid=0 */ + copied += 10; + + /* + * If there is no , after the subvol= option then we know there's no + * other options and we can just return. + */ + pos = strchr(pos, ','); + if (!pos) + return ret; + + /* Copy the rest of the arguments into our buffer */ + strncpy(ret + copied, pos, len - copied); + copied += strlen(pos); + + return ret; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct super_block *s; + struct dentry *root; + struct vfsmount *mnt; + struct mnt_namespace *ns_private; + char *newargs; + struct path path; + int error; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + kfree(newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + ns_private = create_mnt_ns(mnt); + if (IS_ERR(ns_private)) { + mntput(mnt); + return ERR_CAST(ns_private); + } + + /* + * This will trigger the automount of the subvol so we can just + * drop the mnt we have here and return the dentry that we + * found. + */ + error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, + LOOKUP_FOLLOW, &path); + put_mnt_ns(ns_private); + if (error) + return ERR_PTR(error); + + /* Get a ref to the sb and the dentry we found and return it */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + root = dget(path.dentry); + path_put(&path); + down_write(&s->s_umount); + + return root; +} /* * Find a superblock for the given device / mount point. 
@@ -761,13 +864,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) return ERR_PTR(error); + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - goto error_free_subvol_name; + return ERR_PTR(error); error = btrfs_open_devices(fs_devices, mode, fs_type); if (error) - goto error_free_subvol_name; + return ERR_PTR(error); if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; @@ -792,14 +901,15 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, bdev = fs_devices->latest_bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); - if (IS_ERR(s)) - goto error_s; + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; + return ERR_PTR(-EBUSY); } btrfs_close_devices(fs_devices); @@ -814,64 +924,25 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error_free_subvol_name; + return ERR_PTR(error); } btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } - /* if they gave us a subvolume name bind mount into that */ - if (strcmp(subvol_name, ".")) { - struct dentry *new_root; - - root = get_default_root(s, subvol_rootid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } - - mutex_lock(&root->d_inode->i_mutex); - new_root = lookup_one_len(subvol_name, root, - strlen(subvol_name)); - mutex_unlock(&root->d_inode->i_mutex); - - if (IS_ERR(new_root)) { - dput(root); - deactivate_locked_super(s); - error = PTR_ERR(new_root); - goto error_free_subvol_name; - } - if (!new_root->d_inode) { - dput(root); - dput(new_root); - deactivate_locked_super(s); - error = -ENXIO; - goto error_free_subvol_name; - } - dput(root); - root = new_root; - } else { - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + deactivate_locked_super(s); + return root; } - kfree(subvol_name); return root; -error_s: - error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); kfree(fs_info); kfree(tree_root); -error_free_subvol_name: - kfree(subvol_name); return ERR_PTR(error); } -- cgit v1.2.3-58-ga151 From fb25e9141ab843794d5cdef3936ccb58435e2371 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Jul 2011 17:00:46 -0400 Subject: Btrfs: use bytes_may_use for all ENOSPC reservations We have been using bytes_reserved for metadata reservations, which is wrong since we use that to keep track of outstanding reservations from the allocator. This resulted in us doing a lot of silly things to make sure we don't allocate a bunch of metadata chunks since we never had a real view of how much space was actually in use by metadata. This passes Arne's enospc test and xfstests as well as my own enospc tests. Hopefully this will get us moving in the right direction. 
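The intended division of labour between the counters after this change is, as a simplified sketch (the real code does all of this under space_info->lock, via the helpers in the diff below):

	space_info->bytes_may_use  += num_bytes;  /* ENOSPC reservation made   */
	space_info->bytes_may_use  -= num_bytes;  /* allocator hands out space */
	space_info->bytes_reserved += num_bytes;
	space_info->bytes_reserved -= num_bytes;  /* extent becomes used       */
	space_info->bytes_used     += num_bytes;

so bytes_reserved only ever tracks outstanding allocator reservations, while bytes_may_use carries the ENOSPC accounting.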
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 - fs/btrfs/extent-tree.c | 163 ++++++++++++++++++++++++-------------------- fs/btrfs/free-space-cache.c | 29 ++++++-- 3 files changed, 112 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..332cbdc86ad4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2196,8 +2196,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 03edac4f7771..fbe6278f466b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -52,6 +52,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +96,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -3128,9 +3145,7 @@ commit_trans: } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3163,6 +3178,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3170,6 +3186,13 @@ static int should_alloc_chunk(struct btrfs_root *root, if (force == CHUNK_ALLOC_FORCE) return 1; + /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. 
@@ -3317,7 +3340,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) @@ -3341,9 +3364,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3401,7 +3424,6 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, int ret = 0; bool committed = false; bool flushing = false; - again: ret = 0; spin_lock(&space_info->lock); @@ -3443,7 +3465,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - space_info->bytes_reserved += orig_bytes; + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3614,7 +3636,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3825,12 +3847,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4133,7 +4155,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4185,7 +4206,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4212,46 +4232,55 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } -/* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. 
+ * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4322,7 +4351,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } else if (cache->reserved_pinned > 0) { len = min(len, cache->reserved_pinned); cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; + cache->space_info->bytes_may_use += len; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4701,27 +4730,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } goto out; } pin: @@ -4881,6 +4891,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -5200,8 +5212,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -5323,7 +5335,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5425,7 +5438,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5628,7 +5641,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -6594,7 +6608,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->reserved_pinned + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; + sinfo->bytes_may_use += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; @@ -6962,7 +6976,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927401d0..79c16a68a2bc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2472,9 +2472,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, spin_unlock(&ctl->tree_lock); if (bytes >= minlen) { - int update_ret; - update_ret = btrfs_update_reserved_bytes(block_group, - bytes, 1, 1); + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); ret = btrfs_error_discard_extent(fs_info->extent_root, start, @@ -2482,9 +2492,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, &actually_trimmed); btrfs_add_free_space(block_group, start, bytes); - if (!update_ret) - btrfs_update_reserved_bytes(block_group, - bytes, 0, 1); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + 
space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } if (ret) break; -- cgit v1.2.3-58-ga151 From 9e4871070b5f070cacf26525389d56e0345ba156 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 1 Aug 2011 12:08:18 -0400 Subject: Btrfs: skip looking for delalloc if we don't have ->fill_delalloc We always look for delalloc bytes in our io_tree so we can fill in delalloc. This is fine in most cases, but if we're writing out the btree_inode this is just a superfluous tree search on the io_tree, and if we have a lot of metadata dirty this could be an expensive check. So instead check to see if our io_tree has a ->fill_delalloc op, and if not don't even bother doing the lookup. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..7d5e55632809 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2136,6 +2136,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, int compressed; int write_flags; unsigned long nr_written = 0; + bool fill_delalloc = true; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; @@ -2166,10 +2167,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); + if (!tree->ops || !tree->ops->fill_delalloc) + fill_delalloc = false; + delalloc_start = start; delalloc_end = 0; page_started = 0; - if (!epd->extent_locked) { + if (!epd->extent_locked && fill_delalloc) { u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated -- cgit v1.2.3-58-ga151 From 7709cde33f12db71efb377fae4ae7aab6c94ebc6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 10:25:02 -0400 Subject: Btrfs: calculate checksum space correctly We have not been reserving enough space for checksums. We were just reserving bytes for the checksum items themselves, we were not taking into account having to cow the tree and such. This patch adds a csum_bytes counter to the inode for keeping track of the number of bytes outstanding we have for checksums. Then we calculate how many leaves would be required for the checksums we are given and use that to reserve space. This adds a significant amount of bytes to our reservations, but we will handle this later. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 6 +++ fs/btrfs/extent-tree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/inode.c | 3 ++ 3 files changed, 118 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c70fb10a307b..5a5d325a3935 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -123,6 +123,12 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. 
+ */ + u64 csum_bytes; + /* flags field from the on disk inode */ u32 flags; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fbe6278f466b..4add1ac2dda0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3984,11 +3984,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ static unsigned drop_outstanding_extent(struct inode *inode) { unsigned dropped_extents = 0; - spin_lock(&BTRFS_I(inode)->lock); BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; @@ -3998,19 +4006,70 @@ static unsigned drop_outstanding_extent(struct inode *inode) */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - goto out; + return 0; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: - spin_unlock(&BTRFS_I(inode)->lock); return dropped_extents; } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. 
+ */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - return num_bytes >>= 3; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -4037,9 +4096,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); } + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { unsigned dropped; @@ -4047,8 +4106,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * We don't need the return value since our reservation failed, * we just need to clean up our counter. */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); WARN_ON(dropped > 1); + BTRFS_I(inode)->csum_bytes -= num_bytes; + spin_unlock(&BTRFS_I(inode)->lock); return ret; } @@ -4057,6 +4119,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. 
+ */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4064,9 +4135,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4074,6 +4147,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4091,6 +4179,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 156c3a0da792..98b9fa2d77f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6757,6 +6757,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delalloc_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; @@ -6802,6 +6803,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also -- cgit v1.2.3-58-ga151 From dba68306f3fae681b1005137f130f5bcfdfed34a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 15:34:57 -0400 Subject: Btrfs: kill the orphan space calculation for snapshots This patch kills off the calculation for the amount of space needed for the orphan operations during a snapshot. The thing is we only do snapshots on commit, so any space that is in the block_rsv->freed[] isn't going to be in the new snapshot anyway, so there isn't any reason to require that space to be reserved for the snapshot to occur. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 5 --- fs/btrfs/inode.c | 83 -------------------------------------------------- fs/btrfs/transaction.c | 2 -- 3 files changed, 90 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 332cbdc86ad4..4ef777e85c89 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2577,11 +2577,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98b9fa2d77f7..5f5f8a577e69 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2079,89 +2079,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) up_read(&root->fs_info->cleanup_work_sem); } -/* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. - */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve) -{ - struct btrfs_root *root; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - - root = pending->root; - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - block_rsv = root->orphan_block_rsv; - - /* orphan block reservation for the snapshot */ - num_bytes = block_rsv->size; - - /* - * after the snapshot is created, COWing tree blocks may use more - * space than it frees. So we should make sure there is enough - * reserved space. 
- */ - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes += block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - } - - *bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_root *snap = pending->snap; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - int ret; - - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - /* refill source subvolume's orphan block reservation */ - block_rsv = root->orphan_block_rsv; - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes = block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - root->orphan_block_rsv, - num_bytes); - BUG_ON(ret); - } - - /* setup orphan block reservation for the snapshot */ - block_rsv = btrfs_alloc_block_rsv(snap); - BUG_ON(!block_rsv); - - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - snap->orphan_block_rsv = block_rsv; - - num_bytes = root->orphan_block_rsv->size; - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - block_rsv, num_bytes); - BUG_ON(ret); - -#if 0 - /* insert orphan item for the snapshot */ - WARN_ON(!root->orphan_item_inserted); - ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, - snap->root_key.objectid); - BUG_ON(ret); - snap->orphan_item_inserted = 1; -#endif -} - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e24b7964a155..3e20cc8c1c06 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -911,7 +911,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); - btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, @@ -1002,7 +1001,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BUG_ON(IS_ERR(pending->snap)); btrfs_reloc_post_snapshot(trans, pending); - btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); trans->block_rsv = rsv; -- cgit v1.2.3-58-ga151 From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 10:25:38 -0400 Subject: Btrfs: kill the durable block rsv stuff This is confusing code and isn't used by anything anymore, so delete it. 
Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 ------ fs/btrfs/disk-io.c | 2 - fs/btrfs/extent-tree.c | 100 +++++++++---------------------------------------- fs/btrfs/inode.c | 4 -- fs/btrfs/relocation.c | 1 - 5 files changed, 17 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ef777e85c89..c5ceba4078cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -772,13 +772,10 @@ struct btrfs_space_info { struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; atomic_t usage; unsigned int priority:8; - unsigned int durable:1; unsigned int refill_used:1; unsigned int full:1; }; @@ -840,7 +837,6 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -919,11 +915,6 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -2238,8 +2229,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..0b5643a68d57 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1665,8 +1665,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4add1ac2dda0..30c0558eae84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) spin_lock_init(&rsv->lock); atomic_set(&rsv->usage, 1); rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root, { if (rsv && atomic_dec_and_test(&rsv->usage)) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); + kfree(rsv); } } -/* - * make the block_rsv struct be able to capture freed space. 
- * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); -} - int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, ret = 0; } else { num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; + commit_trans = 1; } spin_unlock(&block_rsv->lock); if (!ret) @@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, } if (commit_trans) { + struct btrfs_space_info *sinfo = block_rsv->space_info; + if (trans) return -EAGAIN; + + spin_lock(&sinfo->lock); + if (sinfo->bytes_pinned < num_bytes) { + spin_unlock(&sinfo->lock); + return -ENOSPC; + } + spin_unlock(&sinfo->lock); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_may_use += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 
pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } } out: /* @@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_may_use += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f5f8a577e69..6402a41b9023 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2164,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -6505,7 +6502,6 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273d..545b04358249 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3651,7 +3651,6 @@ int prepare_to_relocate(struct reloc_control *rc) return ret; rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; -- cgit v1.2.3-58-ga151 From 4289a667a0d7c6b134898cac7bfbe950267c305c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 13:22:24 -0400 Subject: Btrfs: fix how we reserve space for deleting inodes I converted btrfs_truncate to do sane reservations for truncate, but didn't convert btrfs_evict_inode. Basically we need to save the orphan_rsv for deleting the orphan item, and do normal reservations for our truncate. 
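The new eviction path below boils down to a reserve-then-retry loop. Roughly, as a condensed sketch of the hunk that follows (error paths and the end-of-transaction bookkeeping elided, see the diff for the real flow):

    rsv = btrfs_alloc_block_rsv(root);      /* slack space for the truncate */
    while (1) {
            /* if we cannot get even the slack space, bail out and
             * leave the orphan item to be cleaned up on next mount */
            if (btrfs_block_rsv_check(NULL, root, rsv, min_size, 0))
                    break;
            trans = btrfs_start_transaction(root, 0);
            trans->block_rsv = rsv;
            ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
            if (ret != -EAGAIN)     /* -EAGAIN: end this trans, loop again */
                    break;
            btrfs_end_transaction(trans, root);
    }

The orphan item deletion at the end still runs out of root->orphan_block_rsv, which the unlink already funded, so the loop only has to pay for the truncate itself.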
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6402a41b9023..fe3891e240b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3527,6 +3527,8 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; + u64 min_size = btrfs_calc_trans_metadata_size(root, 2); unsigned long nr; int ret; @@ -3554,22 +3556,44 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_i_size_write(inode, 0); + /* + * This is a bit simpler than btrfs_truncate since + * + * 1) We've already reserved our space for our orphan item in the + * unlink. + * 2) We're going to delete the inode item, so we don't need to update + * it at all. + * + * So we just need to reserve some slack space in case we add bytes when + * doing the truncate. + */ while (1) { - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = root->orphan_block_rsv; - - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); + ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + printk(KERN_WARNING "Could not get space for a " + "delete, will truncate on mount\n"); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3578,14 +3602,17 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); - } + btrfs_free_block_rsv(root, rsv); + if (ret == 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); BUG_ON(ret); } + trans->block_rsv = &root->fs_info->trans_block_rsv; if (!(root == root->fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); -- cgit v1.2.3-58-ga151 From 6ab60601d518d563ca1a47eaa399096e69d3b64a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 08:24:46 -0400 Subject: Btrfs: ratelimit the generation printk for the free space cache A user reported getting spammed when moving to 3.0 by this message. Since we switched to the normal checksumming infrastructure all old free space caches will be wrong and need to be regenerated so people are likely to see this message a lot, so ratelimit it so it doesn't fill up their logs and freak them out. 
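For reference, printk_ratelimited() (from linux/ratelimit.h) is a drop-in replacement for printk() that keeps a per-call-site ratelimit state; with the default settings it lets through a burst of at most 10 messages per 5-second window and drops the rest. The usage pattern is simply (shortened message for illustration, the real string is in the hunk below):

    #include <linux/ratelimit.h>

    /* at most ~10 of these per 5 seconds from this call site */
    printk_ratelimited(KERN_ERR "btrfs: space cache generation mismatch\n");

The format string and arguments stay exactly as they were; only the function name changes.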
Thanks, Reported-by: Andrew Lutomirski Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 79c16a68a2bc..ecc1a4f85d20 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -341,11 +342,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, gen = addr; if (*gen != BTRFS_I(inode)->generation) { - printk(KERN_ERR "btrfs: space cache generation" - " (%llu) does not match inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); + printk_ratelimited(KERN_ERR "btrfs: space cache" + " generation (%llu) does not match " + "inode (%llu)\n", + (unsigned long long)*gen, + (unsigned long long) + BTRFS_I(inode)->generation); kunmap(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-58-ga151 From dabdb6408cb801644fa613c7432da012640b348c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 12:50:18 -0400 Subject: Btrfs: kill unused parts of block_rsv The priority and refill_used flags are not used anymore, and neither is the usage counter, so just remove them from btrfs_block_rsv. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 23 ++++++----------------- fs/btrfs/relocation.c | 2 -- 3 files changed, 6 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5ceba4078cc..58a06dea4791 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -774,9 +774,6 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - atomic_t usage; - unsigned int priority:8; - unsigned int refill_used:1; unsigned int full:1; }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30c0558eae84..5395cc639270 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3659,8 +3659,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3681,10 +3679,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - kfree(rsv); - } + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, @@ -3734,13 +3730,10 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } if (commit_trans) { @@ -3859,16 +3852,12 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 
fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 545b04358249..aeaed99e9cfe 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3650,8 +3650,6 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->block_rsv->refill_used = 1; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; -- cgit v1.2.3-58-ga151 From 13553e5221d6901a33b3f2157a389de085c161fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 13:33:21 -0400 Subject: Btrfs: don't try to commit in btrfs_block_rsv_check We will try and reserve metadata bytes in btrfs_block_rsv_check and if we cannot because we have a transaction open it will return EAGAIN, so we do not need to try and commit the transaction again. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5395cc639270..6356ef2f0c80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3708,7 +3708,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, u64 min_reserved, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) @@ -3720,13 +3719,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (min_reserved > num_bytes) num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) { + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; @@ -3736,26 +3734,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - if (commit_trans) { - struct btrfs_space_info *sinfo = block_rsv->space_info; - - if (trans) - return -EAGAIN; - - spin_lock(&sinfo->lock); - if (sinfo->bytes_pinned < num_bytes) { - spin_unlock(&sinfo->lock); - return -ENOSPC; - } - spin_unlock(&sinfo->lock); - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - return 0; - } - - return -ENOSPC; + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, -- cgit v1.2.3-58-ga151 From 907cbcebd4e5f641faf08601f216b1ceb6cb3bdf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 13:46:15 -0400 Subject: Btrfs: optimize how we account for space in truncate Currently we're starting and stopping a transaction for no real reason, so kill that and just reserve enough space as if we can truncate all in one transaction. Also use btrfs_block_rsv_check() for our reserve to minimize the amount of space we may have to allocate for our slack space. 
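In outline, the reworked btrfs_truncate() reserves once up front and then merely tops up. A simplified sketch of the hunks below (error handling and transaction restarts elided):

    /* one trans start funds slack + orphan add + orphan del + inode update */
    trans = btrfs_start_transaction(root, 5);
    /* move the truncate slack into our private reserve, once */
    btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, min_size);
    while (1) {
            /* no-op while rsv still holds min_size; refills it otherwise */
            ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0);
            if (ret)
                    break;
            trans->block_rsv = rsv;
            ret = btrfs_truncate_inode_items(trans, root, inode,
                                             inode->i_size,
                                             BTRFS_EXTENT_DATA_KEY);
            if (ret != -EAGAIN)
                    break;
    }

The old loop instead ended the transaction and re-ran the full reservation dance on every pass, even when the reserve was still untouched.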
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 58 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fe3891e240b3..4d057c084de6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6482,6 +6482,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trans_metadata_size(root, 2); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6530,17 +6531,21 @@ static int btrfs_truncate(struct inode *inode) if (!rsv) return -ENOMEM; - trans = btrfs_start_transaction(root, 4); + /* + * 2 for the truncate slack space + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion + * 1 for updating the inode. + */ + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - /* - * Reserve space for the truncate process. Truncate should be adding - * space, but if there are snapshots it may end up using space. - */ - ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); BUG_ON(ret); ret = btrfs_orphan_add(trans, inode); @@ -6549,21 +6554,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - /* - * Ok so we've already migrated our bytes over for the truncate, so here - * just reserve the one slot we need for updating the inode. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; - } - trans->block_rsv = rsv; - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -6585,20 +6575,30 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0); + if (ret) { + /* + * This can only happen with the original transaction we + * started above, every other time we shouldn't have a + * transaction started yet. + */ + if (ret == -EAGAIN) + goto end_trans; + err = ret; + break; + } + if (!trans) { - trans = btrfs_start_transaction(root, 3); + /* Just need the 1 for updating the inode */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - - ret = btrfs_truncate_reserve_metadata(trans, root, - rsv); - BUG_ON(ret); - - trans->block_rsv = rsv; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } - +end_trans: nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; -- cgit v1.2.3-58-ga151 From 5e962c7850c273b483acc747b41bd5cddf631049 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 14:03:37 -0400 Subject: Btrfs: kill btrfs_truncate_reserve_metadata Since we've optimized the truncate path, we no longer require this function. 
Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 31 ------------------------------- 2 files changed, 34 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 58a06dea4791..22a9347a3908 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2240,9 +2240,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6356ef2f0c80..d05967e9d613 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3858,37 +3858,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); } -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; - u64 num_bytes; - int ret; - - /* - * Truncate should be freeing data, but give us 2 items just in case it - * needs to use some space. We may want to be smarter about this in the - * future. - */ - num_bytes = btrfs_calc_trans_metadata_size(root, 2); - - /* We already have enough bytes, just return */ - if (rsv->reserved >= num_bytes) - return 0; - - num_bytes -= rsv->reserved; - - /* - * You should have reserved enough space before hand to do this, so this - * should not fail. - */ - ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); - BUG_ON(ret); - - return 0; -} - void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { -- cgit v1.2.3-58-ga151 From 1b9c332b6c92e992b1971a08412c6f460a54b514 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 17 Aug 2011 10:19:52 -0400 Subject: Btrfs: only reserve space in fallocate if we have to do a preallocate Lukas found a problem where if he tries to fallocate over the same region twice and the first fallocate took up all the space we would fail with ENOSPC. This is because we reserve the total space we want to use for fallocate, regardless of whether or not we will have to actually preallocate. So instead move the check into the loop where we actually have to do the preallocate. Thanks, Tested-by: Lukas Czerner Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4e57d59edb7..de569af766fe 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1615,10 +1615,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -1664,11 +1660,27 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + + /* + * Make sure we have enough space before we do the + * allocation.
+ */ + ret = btrfs_check_data_free_space(inode, last_byte - + cur_offset); + if (ret) { + free_extent_map(em); + break; + } + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); + + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, last_byte - + cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1694,8 +1706,6 @@ static long btrfs_fallocate(struct file *file, int mode, } unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3-58-ga151 From 07127184efb629f1336c0592bfdacec258cab731 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 10:29:59 -0400 Subject: Btrfs: reduce the amount of space needed for truncates With btrfs_truncate_inode_items we always return if we have to go to another leaf, which makes us do our reservation again. This means we will only ever modify one leaf at a time, so we only need 1 item's worth of slack space. Also, since we are deleting we will not be creating nodes as we go down, if anything we'll be free'ing them as we merge them together, so make a different calculation for truncate which will only have the worst case usage of COW'ing the entire path down to the leaf. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 +++++++++++ fs/btrfs/inode.c | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 22a9347a3908..2e18b068841b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2125,6 +2125,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3 * num_items; } +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. + */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + num_items; +} + void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4d057c084de6..8316b570db55 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3528,7 +3528,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - u64 min_size = btrfs_calc_trans_metadata_size(root, 2); + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -6482,7 +6482,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; - u64 min_size = btrfs_calc_trans_metadata_size(root, 2); + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6532,12 +6532,12 @@ static int btrfs_truncate(struct inode *inode) return -ENOMEM; /* - * 2 for the truncate slack space + * 1 for the truncate slack space * 1 for the orphan item we're going to add * 1 for the orphan item deletion * 1 for updating the inode.
*/ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } -- cgit v1.2.3-58-ga151 From 482e6dc5261406fdb921946e70b51467b0305bad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 10:31:56 -0400 Subject: Btrfs: allow callers to specify if flushing can occur for btrfs_block_rsv_check If you run xfstest 224 you will get lots of messages about not being able to delete inodes and that they will be cleaned up next mount. This is because btrfs_block_rsv_check was not calling reserve_metadata_bytes with the ability to flush, so if there was not enough space, it simply failed. But in the truncate and evict cases we could easily flush space to try and get enough space to do our work, so make btrfs_block_rsv_check take a flush argument to pass down to reserve_metadata_bytes. Now xfstests 224 runs fine without all those complaints. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/transaction.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2e18b068841b..caa73cd8c00a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2244,7 +2244,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor); + u64 min_reserved, int min_factor, int flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d05967e9d613..a71fcf506531 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3705,7 +3705,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) + u64 min_reserved, int min_factor, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3728,7 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecc1a4f85d20..b0122e19db6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -199,7 +199,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, - 0, 5); + 0, 5, 0); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8316b570db55..e40b9239660d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3576,10 +3576,10 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate.
*/ while (1) { - ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0); + ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1); if (ret) { printk(KERN_WARNING "Could not get space for a " - "delete, will truncate on mount\n"); + "delete, will truncate on mount %d\n", ret); btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); goto no_delete; } @@ -6575,7 +6575,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0); + ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index aeaed99e9cfe..fd9ac66434b0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2042,7 +2042,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0); + min_reserved, 0, 0); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -3775,7 +3775,7 @@ restart: } ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5); + rc->block_rsv, 0, 5, 0); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3e20cc8c1c06..a1d8c322c1ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -419,7 +419,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, { int ret; ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5); + &root->fs_info->global_block_rsv, 0, 5, 0); return ret ? 1 : 0; } -- cgit v1.2.3-58-ga151 From a9b5fcddce594a408a48d523087b5bb64ce82469 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 12:06:12 -0400 Subject: Btrfs: fix call to btrfs_search_slot in free space cache We are setting ins_len to 1 even though we are just modifying an item that should be there already. This may cause the search stuff to split nodes on the way down needlessly. Set this to 0 since we aren't inserting anything. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b0122e19db6b..701ef5951e3f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -802,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, key.offset = offset; key.type = 0; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, -- cgit v1.2.3-58-ga151 From 7ed49f187c82821e35f8869399bcf90822a74a23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 15:45:52 -0400 Subject: Btrfs: fix space leak when we fail to make an allocation When changing back to using a spin_lock to protect the extent counters I decided that since we would only be dropping our original extent, it was ok to just drop the extent and return. However since somebody else could have come in and done a reservation, we need to do the normal song and dance to clear the reservation out properly. So calculate how much space we need to free, and then subtract what we just attempted to reserve.
If it's more, then we know we need to drop those bytes from the delalloc block rsv. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a71fcf506531..99ab5716baad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4025,16 +4025,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { + u64 to_free = 0; unsigned dropped; - /* - * We don't need the return value since our reservation failed, - * we just need to clean up our counter. - */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - WARN_ON(dropped > 1); - BTRFS_I(inode)->csum_bytes -= num_bytes; + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + /* + * Somebody could have come in and twiddled with the + * reservation, so if we have to free more than we would have + * reserved from this reservation go ahead and release those + * bytes. + */ + to_free -= to_reserve; + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); return ret; } -- cgit v1.2.3-58-ga151 From 7f70150896ebd1169d9c43484c8c424f755353c4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Aug 2011 15:23:19 -0400 Subject: Btrfs: don't increase the block_rsv's size when emergency allocating space If we have to emergency reserve space we need to not increase the block_rsv size, otherwise we'll leak space. Take delalloc, for instance: say we reserve 4k and we use that 4k, and then we have to emergency allocate another 4k. We bump the size up to 8k; however, we've only accounted for 4k in reservations in all of our supporting logic, so we'll go to free the 4k and end up having a size of 4k, which will cause us to later not free as much space. I saw this doing testing where I wasn't reserving enough space for something but was still leaking space, very frustrating. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99ab5716baad..1f1d3e8dcec9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5707,9 +5707,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); -- cgit v1.2.3-58-ga151 From 4a33854257764c2ec6337ee0c8ecafb64f8e29e1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Aug 2011 11:01:31 -0400 Subject: Btrfs: set truncate block rsv's size While debugging a different issue I noticed that we were always reserving space when we tried to use our truncate block rsvs. This is because they didn't have a ->size value, so use_block_rsv just assumes there is nothing reserved and it does a reserve_metadata_bytes. This is because btrfs_block_rsv_check() doesn't actually add to the size of the block rsv. That seems to be the right thing to do, so set ->size to the minimum truncate size we need, since we will always only refill to that size anyway, and this way everything works out correctly.
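To make the ->size versus ->reserved interplay concrete, here is a minimal self-contained userspace C sketch of the behavior the message describes (not kernel code; rsv, use_rsv and fresh_reservations are hypothetical names, and the accounting is deliberately simplified):

#include <stdio.h>

struct rsv {
        unsigned long long size;     /* how much this rsv is meant to maintain */
        unsigned long long reserved; /* how much it currently holds */
};

static int fresh_reservations; /* counts stand-ins for reserve_metadata_bytes() */

/* Simplified take on use_block_rsv(): a zero ->size looks "unmaintained",
 * so every use falls back to a brand new reservation. */
static void use_rsv(struct rsv *r, unsigned long long blocksize)
{
        if (r->size == 0 || r->reserved < blocksize) {
                fresh_reservations++;
                r->reserved += blocksize;
        }
        r->reserved -= blocksize;
}

int main(void)
{
        struct rsv trunc = { 0, 0 };
        int i;

        for (i = 0; i < 4; i++)
                use_rsv(&trunc, 4096);
        printf("size unset: %d fresh reservations\n", fresh_reservations);

        fresh_reservations = 0;
        trunc.size = 4096;
        trunc.reserved = 4096;               /* refilled up front, as the patch does */
        for (i = 0; i < 4; i++) {
                use_rsv(&trunc, 4096);
                trunc.reserved = trunc.size; /* periodic refill to ->size */
        }
        printf("size set:   %d fresh reservations\n", fresh_reservations);
        return 0;
}

With ->size left at 0, every use in this model falls back to a fresh reservation, which is exactly the needless reserving the message complains about.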
Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e40b9239660d..06ae5b173fd7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3561,6 +3561,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_orphan_del(NULL, inode); goto no_delete; } + rsv->size = min_size; btrfs_i_size_write(inode, 0); @@ -6530,6 +6531,7 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; + rsv->size = min_size; /* * 1 for the truncate slack space -- cgit v1.2.3-58-ga151 From 300e4f8a56f263797568c95b71c949f9f02e4534 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Aug 2011 14:06:00 -0400 Subject: Btrfs: put the block group cache after we commit the super In moving some enospc stuff around I noticed that when we unmount we are often evicting the free space cache inodes before we do our last commit. This isn't bad, but it makes us constantly have to re-read the inodes back. So instead don't evict the cache until after we do our last commit, this will make things a little less crappy and makes a future enospc change work properly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/free-space-cache.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0b5643a68d57..4965a0179b31 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2543,8 +2543,6 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(root->fs_info); - btrfs_put_block_group_cache(fs_info); - /* * Here come 2 situations when btrfs is broken to flip readonly: * @@ -2570,6 +2568,8 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + btrfs_put_block_group_cache(fs_info); + kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 701ef5951e3f..1ea10731797a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -105,7 +105,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!btrfs_fs_closing(root->fs_info)) { + if (!block_group->iref) { block_group->inode = igrab(inode); block_group->iref = 1; } -- cgit v1.2.3-58-ga151 From c09544e07f8cdc455ed8615d4c067d694c33bd18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 10:19:10 -0400 Subject: Btrfs: handle enospc accounting for free space inodes Since free space inodes now use normal checksumming we need to make sure to account for their metadata use. So reserve metadata space, and then if we fail to write out the metadata we can just release it, otherwise it will be freed up when the io completes. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 18 +++++++++++++----- fs/btrfs/free-space-cache.c | 44 +++++++++++++++++++++++++++++--------------- fs/btrfs/inode-map.c | 6 ++++-- fs/btrfs/inode.c | 2 +- 4 files changed, 47 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1f1d3e8dcec9..ccdc4d12e8d4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2755,16 +2755,20 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_delalloc_reserve_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) + if (!ret) { dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + btrfs_free_reserved_data_space(inode, num_pages); + } else { + btrfs_delalloc_release_space(inode, num_pages); + } + out_put: iput(inode); out_free: @@ -4002,9 +4006,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve = 0; unsigned nr_extents = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4023,7 +4031,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1ea10731797a..3bde17ff14c0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -532,6 +532,19 @@ out: return ret; } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. 
+ */ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, @@ -555,7 +568,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; - int ret = -1; + int ret; + int err = -1; bool next_page = false; bool out_of_space = false; @@ -563,7 +577,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, node = rb_first(&ctl->free_space_offset); if (!node) - return 0; + return -1; if (!i_size_read(inode)) return -1; @@ -767,7 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - ret = 0; goto out; } @@ -789,10 +802,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - if (ret) { - ret = 0; + if (ret) goto out; - } BTRFS_I(inode)->generation = trans->transid; @@ -804,7 +815,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); @@ -818,7 +828,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, @@ -835,16 +844,15 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = 1; - + err = 0; out: kfree(pages); - if (ret != 1) { + if (err) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return ret; + return err; } int btrfs_write_out_cache(struct btrfs_root *root, @@ -871,14 +879,16 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); - if (ret < 0) { + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); ret = 0; - +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free space cace " "for block group %llu\n", block_group->key.objectid); +#endif } iput(inode); @@ -2662,9 +2672,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret < 0) + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free ino cache " "for root %llu\n", root->root_key.objectid); +#endif + } iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa871..53dcbdf446cd 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -465,14 +465,16 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, prealloc); if 
(ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); - if (ret) + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); goto out_put; + } btrfs_free_reserved_data_space(inode, prealloc); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ae5b173fd7..78b5ae59ac4f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1792,11 +1792,11 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } ret = 0; out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); if (nolock) { if (trans) btrfs_end_transaction_nolock(trans, root); } else { - btrfs_delalloc_release_metadata(inode, ordered_extent->len); if (trans) btrfs_end_transaction(trans, root); } -- cgit v1.2.3-58-ga151 From 4c13d758b7e79c14a0026c1f783f0c79e339b7bb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:31:29 -0400 Subject: Btrfs: use the transactions block_rsv for the csum root The alloc warnings everybody has been seeing are because we have been reserving space for csums, but we weren't actually using that space. So make get_block_rsv() return the trans->block_rsv if we're modifying the csum root. Also set the trans->block_rsv to NULL so that if we modify the csum root when running delayed refs, the space comes out of the global reserve like it's supposed to. With this patch I'm not seeing those alloc warnings anymore. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++------ fs/btrfs/transaction.c | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ccdc4d12e8d4..53f6dbdab510 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3565,10 +3565,12 @@ out: static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3865,12 +3867,13 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; + if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + block_rsv = &root->fs_info->trans_block_rsv; + btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a1d8c322c1ba..a770f4bd9d31 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -453,6 +453,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; -- cgit v1.2.3-58-ga151 From d02c9955ded7fc56dd1edc987558b084ccb03eb4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:40:48 -0400 Subject: Btrfs: don't get the block_rsv in btrfs_free_tree_block Since the durable block rsv stuff has been killed there is no need to get the block_rsv in btrfs_free_tree_block anymore.
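The get_block_rsv() hunk above reduces to a three-step fallback. Here is a compilable userspace C model of that selection order; pick_rsv and the structs are hypothetical stand-ins for the kernel types:

#include <stdio.h>

struct rsv { const char *name; };

static struct rsv global_rsv = { "global" };

struct root  { int ref_cows; int is_csum_root; struct rsv *rsv; };
struct trans { struct rsv *rsv; };

/* Selection order: prefer the transaction's rsv for COW-able roots and
 * the csum root, then the root's own rsv, then a last-resort fallback. */
static struct rsv *pick_rsv(struct trans *t, struct root *r)
{
        struct rsv *rsv = NULL;

        if (r->ref_cows || r->is_csum_root)
                rsv = t->rsv;
        if (!rsv)
                rsv = r->rsv;
        if (!rsv)
                rsv = &global_rsv;
        return rsv;
}

int main(void)
{
        struct rsv trans_rsv = { "trans" };
        struct trans t = { &trans_rsv };
        struct trans t_null = { NULL };
        struct root csum = { 0, 1, NULL };

        printf("csum root, trans rsv set : %s\n", pick_rsv(&t, &csum)->name);
        printf("csum root, trans rsv NULL: %s\n", pick_rsv(&t_null, &csum)->name);
        return 0;
}

Run, this picks the transaction rsv for the csum root when one is set and falls through to the global fallback when it is NULL, which is why NULLing trans->block_rsv in the follow-up patches routes csum deletions to the global reserve.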
Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53f6dbdab510..d236df790156 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4707,7 +4707,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct btrfs_block_group_cache *cache = NULL; int ret; @@ -4722,10 +4721,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -- cgit v1.2.3-58-ga151 From 4a92b1b8d2810db4ea0c34616b94c0b3810fa027 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 12:34:28 -0400 Subject: Btrfs: stop passing a trans handle all around the reservation code The only thing that we need to have a trans handle for is in reserve_metadata_bytes and thats to know how much flushing we can do. So instead of passing it around, just check current->journal_info for a trans_handle so we know if we can commit a transaction to try and free up space or not. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 ++---- fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++---------------------- fs/btrfs/free-space-cache.c | 4 +--- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 15 +++++++-------- fs/btrfs/transaction.c | 8 ++++---- 6 files changed, 39 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index caa73cd8c00a..a5faf8e33baa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2237,12 +2237,10 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d236df790156..9bb71597aa54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3404,29 +3404,34 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. 
+/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; + struct btrfs_trans_handle *trans; u64 unused; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; bool committed = false; bool flushing = false; + + trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3689,8 +3694,7 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { @@ -3699,7 +3703,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3708,8 +3712,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush) { @@ -3734,7 +3737,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4034,7 +4037,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5689,8 +5692,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -5711,8 +5713,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3bde17ff14c0..ffc42ef44711 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,9 +197,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, - 0, 5, 0); + ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78b5ae59ac4f..2947e94947b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3577,7 +3577,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); if (ret) { printk(KERN_WARNING "Could not get space for a " "delete, will truncate on mount %d\n", ret); @@ -6577,7 +6577,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fd9ac66434b0..3ab67409f90f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0, 0); + ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved, + 0, 0); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2152,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) err = ret; } @@ -2427,7 +2426,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -3645,7 +3644,7 @@ int prepare_to_relocate(struct reloc_control *rc) * btrfs_init_reloc_root will use them when there * is no reservation in transaction handle. 
*/ - ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3774,8 +3773,8 @@ restart: } } - ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0, + 5, 0); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a770f4bd9d31..8d6f4c78f73f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, */ if (num_items > 0 && root != root->fs_info->chunk_root) { num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(NULL, root, + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes); if (ret) @@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0, + 5, 0); return ret ? 1 : 0; } @@ -914,7 +914,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + ret = btrfs_block_rsv_add(root, &pending->block_rsv, to_reserve); if (ret) { pending->error = ret; -- cgit v1.2.3-58-ga151 From 9c8d86db9aee6f85866d480e0f9b37817264814c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Sep 2011 11:58:54 -0400 Subject: Btrfs: make sure to unset trans->block_rsv before running delayed refs Checksums are charged in 2 different ways. The first case is when we're writing to the disk, we account for the new checksums with the delalloc block rsv. In order for this to work we check if we're allocating a block for the csum root and if trans->block_rsv == the delalloc block rsv. But when we're deleting the csums because of cow, this is charged to the global block rsv, and is done when we run the delayed refs. So we need to make sure that trans->block_rsv == NULL when running the delayed refs. So set it to NULL and reset it in should_end_transaction, and set it to NULL in commit_transaction. This got rid of the ridiculous amount of warnings I was seeing when trying to do a balance. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8d6f4c78f73f..7debbf396ef3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; + /* + * We need to do this in case we're deleting csums so the global block + * rsv get's used instead of the csum block rsv. 
+ */ + trans->block_rsv = NULL; + updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) btrfs_run_delayed_refs(trans, root, updates); + trans->block_rsv = rsv; + return should_end_transaction(trans, root); } @@ -1167,6 +1176,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ -- cgit v1.2.3-58-ga151 From 455757c322cc0a0f2a692c5625dd88aaf6a7b889 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Sep 2011 12:26:24 -0400 Subject: Btrfs: delay iput when deleting a block group I kept getting warnings from evict because we were calling btrfs_start_transaction() with a transaction already started when doing a balance. This is because we remove a block group, which requires a transaction, and then put the last reference on the cache inode. Instead of doing this we need to delay the iput so it is not done within an already-started transaction. This gets rid of our warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bb71597aa54..5498bdacd4c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7266,7 +7266,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; -- cgit v1.2.3-58-ga151 From 3b16a4e3c355ee3c790473decfcf83d4faeb8ce0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Sep 2011 15:05:58 -0400 Subject: Btrfs: use the inode's mapping mask for allocating pages Johannes pointed out we were allocating only kernel pages for doing writes, which is kind of a big deal if you are on 32bit and have more than a gig of ram. So fix our allocations to use the mapping's gfp but still clear __GFP_FS so we don't re-enter.
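The fix (the btrfs_alloc_write_mask() helper in the diff below) just masks one bit out of the mapping's allocation flags. A userspace C model with made-up flag values, since the real gfp bits are kernel-internal:

#include <stdio.h>

/* Illustrative bit values only; the real gfp flags live in the kernel. */
#define FAKE_GFP_FS      0x01u
#define FAKE_GFP_IO      0x02u
#define FAKE_GFP_HIGHMEM 0x04u

typedef unsigned int gfp_t;

/* Mirror of the helper's intent: keep whatever the mapping allows
 * (notably highmem on 32-bit) but drop FS reentry. */
static gfp_t alloc_write_mask(gfp_t mapping_mask)
{
        return mapping_mask & ~FAKE_GFP_FS;
}

int main(void)
{
        gfp_t mapping = FAKE_GFP_FS | FAKE_GFP_IO | FAKE_GFP_HIGHMEM;
        gfp_t mask = alloc_write_mask(mapping);

        printf("highmem kept: %s\n", (mask & FAKE_GFP_HIGHMEM) ? "yes" : "no");
        printf("fs cleared  : %s\n", (mask & FAKE_GFP_FS) ? "no" : "yes");
        return 0;
}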
Thanks, Reported-by: Johannes Weiner Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/file.c | 3 ++- fs/btrfs/free-space-cache.c | 6 ++++-- fs/btrfs/inode.c | 3 ++- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/relocation.c | 3 ++- 6 files changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a5faf8e33baa..47dea7118e0e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -2117,6 +2118,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; +} + /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index de569af766fe..f2e928289600 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; u64 start_pos; @@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - GFP_NOFS); + mask); if (!pages[i]) { faili = i - 1; err = -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ffc42ef44711..0a8ccdbdd464 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -254,6 +254,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, u64 num_bitmaps; u64 generation; pgoff_t index = 0; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -310,7 +311,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries && !num_bitmaps) break; - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, mask); if (!page) goto free_cache; @@ -563,6 +564,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, u64 start, end, len; u64 bytes = 0; u32 crc = ~(u32)0; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; @@ -612,7 +614,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * know and don't freak out. 
*/ while (index < num_pages) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, mask); if (!page) { int i; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2947e94947b5..88e3956be57d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3282,6 +3282,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; u64 page_start; u64 page_end; @@ -3294,7 +3295,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = find_or_create_page(mapping, index, GFP_NOFS); + page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 538f65a79ec5..24fd75bb0f96 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -843,6 +843,7 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) return 0; @@ -860,7 +861,7 @@ again: for (i = 0; i < num_pages; i++) { struct page *page; page = find_or_create_page(inode->i_mapping, - start_index + i, GFP_NOFS); + start_index + i, mask); if (!page) break; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3ab67409f90f..7fa090fa0d39 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2921,6 +2921,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unsigned long last_index; struct page *page; struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int nr = 0; int ret = 0; @@ -2955,7 +2956,7 @@ static int relocate_file_extent_cluster(struct inode *inode, ra, NULL, index, last_index + 1 - index); page = find_or_create_page(inode->i_mapping, index, - GFP_NOFS); + mask); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); -- cgit v1.2.3-58-ga151 From a8c9e5769718d47e87cce40c9b84cab421804797 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Sep 2011 16:55:59 -0400 Subject: Btrfs: fix orphan cleanup regression In fixing how we deal with bad inodes, we had a regression in the orphan cleanup code, since it expects to get a bad inode back. So fix it to deal with getting -ESTALE back by deleting the orphan item manually and moving on. 
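A sketch of the control flow this fix establishes, as a self-contained userspace C program; lookup_inode and handle_orphan are hypothetical stand-ins for btrfs_iget() and the cleanup loop body:

#include <stdio.h>
#include <errno.h>

enum lookup { FOUND, GONE };

/* Stand-in for btrfs_iget(): the inode may be gone already. */
static int lookup_inode(enum lookup what) { return what == GONE ? -ESTALE : 0; }

/* Only -ESTALE is survivable: it means "inode gone, orphan item left
 * behind", so drop the item and continue with the next orphan. */
static int handle_orphan(enum lookup what)
{
        int ret = lookup_inode(what);

        if (ret && ret != -ESTALE)
                return ret;   /* real error, bail out */
        if (ret == -ESTALE) {
                printf("deleting stale orphan item\n");
                return 0;     /* continue with the next orphan */
        }
        printf("queueing live inode for orphan handling\n");
        return 0;
}

int main(void)
{
        handle_orphan(GONE);
        handle_orphan(FOUND);
        return 0;
}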
Thanks, Reported-by: Simon Kirby Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88e3956be57d..8005be176252 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2285,37 +2285,35 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + ret = PTR_RET(inode); + if (ret && ret != -ESTALE) goto out; - } - - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); /* - * if this is a bad inode, means we actually succeeded in - * removing the inode, but not the orphan record, which means - * we need to manually delete the orphan since iput will just - * do a destroy_inode + * Inode is already gone but the orphan item is still there, + * kill the orphan item. */ - if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 0); + if (ret == -ESTALE) { + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } - btrfs_orphan_del(trans, inode); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + BUG_ON(ret); btrfs_end_transaction(trans, root); - iput(inode); continue; } + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { if (!S_ISREG(inode->i_mode)) { -- cgit v1.2.3-58-ga151 From ef3be45722317f8c2fb0e861065df0c3830ff9ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 22 Sep 2011 14:30:02 -0400 Subject: Btrfs: check unused against how much space we actually want There is a bug that may lead to early ENOSPC in our reservation code. We've been checking against num_bytes which may be above and beyond what we want to actually reserve, which could give us a false ENOSPC. Fix this by making sure the unused space is above how much we want to reserve and not how much we're trying to flush. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5498bdacd4c3..fd65f6bc676c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3472,7 +3472,7 @@ again: */ if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { + if (unused >= orig_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { -- cgit v1.2.3-58-ga151 From 462d6fac8960a3ba797927adfcbd29d503eb16fd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 13:56:12 -0400 Subject: Btrfs: introduce convert_extent_bit If I have a range where I know a certain bit is set and I want to change it to another bit, the only option I have is to call set and then clear the bit, which will result in 2 tree searches. This is inefficient, so introduce convert_extent_bit, which will go through and set the bit I want and clear the old bit I don't want.
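The semantics are easy to model in userspace with a flat array of per-offset flags; the kernel's real win is that the set and the clear happen in one rbtree pass instead of two searches with two rounds of state splitting. A rough C sketch (the names and the flat array are illustrative only):

#include <stdio.h>
#include <stddef.h>

enum { DIRTY = 1 << 0, NEED_WAIT = 1 << 1 };

/* Set one bit and clear another across [start, end] in a single pass. */
static void convert_bits(unsigned char *state, size_t start, size_t end,
                         int set_bit, int clear_bit)
{
        size_t i;

        for (i = start; i <= end; i++) {
                state[i] |= set_bit;
                state[i] &= ~clear_bit;
        }
}

int main(void)
{
        unsigned char state[8] = { 0 };
        size_t i;

        convert_bits(state, 0, 7, DIRTY, 0);         /* mark range dirty */
        convert_bits(state, 2, 5, NEED_WAIT, DIRTY); /* DIRTY -> NEED_WAIT */
        for (i = 0; i < 8; i++)
                printf("%zu: dirty=%d wait=%d\n", i,
                       !!(state[i] & DIRTY), !!(state[i] & NEED_WAIT));
        return 0;
}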
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 + 2 files changed, 190 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7d5e55632809..0ada0b700b44 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -894,6 +894,194 @@ search_again: goto again; } +/** + * convert_extent - convert all bits in a given range from one bit to another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + next_node = rb_next(&state->rb_node); + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. 
+ */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); + if (err) { + free_extent_state(prealloc); + prealloc = NULL; + goto out; + } + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, &bits); + clear_state_bit(tree, prealloc, &clear_bits, 0); + + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + /* wrappers around set/clear extent bit */ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..cea445dcd806 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -214,6 +214,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, -- cgit v1.2.3-58-ga151 From 1728366efa5ebf48bd2ed544afa8700cd07ba822 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 13:58:47 -0400 Subject: Btrfs: stop using write_one_page While looking for a performance regression a user was complaining about, I noticed that we had a regression with the varmail test of filebench. This was introduced by 0d10ee2e6deb5c8409ae65b970846344897d5e4e which keeps us from calling writepages in writepage. This is a correct change, however it happens to help the varmail test because we write out in larger chunks. This is largly to do with how we write out dirty pages for each transaction. If you run filebench with load varmail set $dir=/mnt/btrfs-test run 60 prior to this patch you would get ~1420 ops/second, but with the patch you get ~1200 ops/second. This is a 16% decrease. 
So since we know the range of dirty pages we want to write out, don't write out in one page chunks, write out in ranges. So to do this we call filemap_fdatawrite_range() on the range of bytes. Then we convert the DIRTY extents to NEED_WAIT extents. When we then call btrfs_wait_marked_extents() we only have to filemap_fdatawait_range() on that range and clear the NEED_WAIT extents. This doesn't get us back to our original speeds, but I've been seeing ~1380 ops/second, which is a <5% regression as opposed to a >15% regression. That is acceptable given that the original commit greatly reduces our latency to begin with. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.h | 1 + fs/btrfs/transaction.c | 86 +++++++++++--------------------------------------- 2 files changed, 20 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cea445dcd806..325a346369da 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,7 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7debbf396ef3..45655793a2c5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -572,50 +572,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - while (start <= end) { - cond_resched(); - - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - - btree_lock_page_hook(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - - if (PageWriteback(page)) { - if (PageDirty(page)) - wait_on_page_writeback(page); - else { - unlock_page(page); - page_cache_release(page); - continue; - } - } - err = write_one_page(page, 0); - if (err) - werr = err; - page_cache_release(page); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, + GFP_NOFS); + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -631,39 +602,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << 
PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - if (PageDirty(page)) { - btree_lock_page_hook(page); - wait_on_page_writeback(page); - err = write_one_page(page, 0); - if (err) - werr = err; - } - wait_on_page_writeback(page); - page_cache_release(page); - cond_resched(); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT)) { + clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; -- cgit v1.2.3-58-ga151 From 726c35fa0edf1d9b8a88b73255532e73089aedda Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 15:46:06 -0400 Subject: Btrfs: use the global reserve as a backup for deleting inodes Xfstests 83 really stresses our ENOSPC since it uses a 100mb fs which ends up with the mixed block group stuff. Because of this we can run into a situation where we don't have enough space to delete inodes, or even worse we can't free the inodes when we next mount the fs which causes the orphan code to lose its mind. So if we fail to make our reservation, steal from the global reserve. The global reserve will end up taking up the entire rest of the free space on the fs in this worst case so there really is no other option. With this patch test 83 doesn't freak out. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8005be176252..96fc9e342219 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3526,7 +3526,7 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv *rsv, *global_rsv; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -3561,6 +3561,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } rsv->size = min_size; + global_rsv = &root->fs_info->global_block_rsv; btrfs_i_size_write(inode, 0); @@ -3577,6 +3578,15 @@ void btrfs_evict_inode(struct inode *inode) */ while (1) { ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. + */ + if (ret) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); + if (ret) { printk(KERN_WARNING "Could not get space for a " "delete, will truncate on mount %d\n", ret); -- cgit v1.2.3-58-ga151 From 8f6d7f4f45f18a5b669dbbf068c74b3d5be59dbf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 15:55:20 -0400 Subject: Btrfs: break out of orphan cleanup if we can't make progress I noticed while running xfstests 83 that if we didn't have enough space to delete our inode the orphan cleanup would just loop. This is because it keeps finding the same orphan item and keeps trying to kill it but can't because we don't get an error back from iput for deleting the inode. So keep track of the last guy we tried to kill, if it's the same as the one we're trying to kill currently we know we are having problems and can just error out. I don't have a way to test this so look hard and make sure it's right. 
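The guard described here (and shown in the diff just below) is a plain no-progress check. A self-contained C sketch, with a deliberately stuck iterator standing in for an orphan item that never actually goes away:

#include <stdio.h>

/* Contrived stand-in for "find the next orphan key": deletion keeps
 * failing silently, so the same key comes back forever. */
static unsigned long long next_orphan(void)
{
        return 42;
}

static int orphan_cleanup(void)
{
        unsigned long long last = 0;
        int passes;

        for (passes = 0; passes < 10; passes++) {
                unsigned long long key = next_orphan();

                if (key == last) {
                        fprintf(stderr, "no progress on %llu, stopping\n", key);
                        return -1; /* the kernel code returns -EINVAL here */
                }
                last = key;
                /* ... try to drop the orphan; may quietly fail ... */
        }
        return 0;
}

int main(void)
{
        return orphan_cleanup() ? 1 : 0;
}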
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 96fc9e342219..15adfb542502 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2230,6 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; + u64 last_objectid = 0; int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) @@ -2281,6 +2282,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ + + if (found_key.offset == last_objectid) { + printk(KERN_ERR "btrfs: Error removing orphan entry, " + "stopping orphan cleanup\n"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; -- cgit v1.2.3-58-ga151 From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 17:12:22 -0400 Subject: Btrfs: allow us to overcommit our enospc reservations One of the things that kills us is the fact that our ENOSPC reservations are horribly over the top in most normal cases. There isn't too much that can be done about this because when we are completely full we really need them to work like this so we don't under reserve. However if there is plenty of unallocated chunks on the disk we can use that to gauge how much we can overcommit. So this patch adds chunk free space accounting so we always know how much unallocated space we have. Then if we fail to make a reservation within our allocated space, check to see if we can overcommit. In the normal flushing case (like with delalloc metadata reservations) we'll take the free space and divide it by 2 if our metadata profile is setup for DUP or any of those, and then divide it by 8 to make sure we don't overcommit too much. Then if we're in a non-flushing case (we really need this reservation now!) we only limit ourselves to half of the free space. This makes this fio test [torrent] filename=torrent-test rw=randwrite size=4g ioengine=sync directory=/mnt/btrfs-test go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB file system. This doesn't seem to break my other enospc tests, but could really use some more testing as this is a super scary change. 
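To make the overcommit math concrete, here is a worked example (the numbers are illustrative only, not from the patch): with 8GiB of unallocated chunk space and a DUP/RAID1/RAID10 metadata profile, the reservation path computes

	u64 avail = 8ULL << 30;	/* 8GiB unallocated, example value */

	avail >>= 1;		/* DUP/RAID1/RAID10: half usable -> 4GiB */

	if (flush)
		avail >>= 3;	/* flushing case: 512MiB of overcommit */
	else
		avail >>= 1;	/* need it right now: 2GiB of overcommit */

	/* the reservation succeeds if used + bytes < total_bytes + avail */
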
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++++------------ fs/btrfs/volumes.c | 39 ++++++++++++++++++++++++++++---- 4 files changed, 88 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47dea7118e0e..1eafccb162ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -893,6 +893,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4965a0179b31..51372a521167 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); @@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd65f6bc676c..25b69d0f9135 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation + * @check - whether this is just to check if we have enough space or not * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, int flush, int check) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; @@ -3459,9 +3460,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= orig_bytes) { + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { @@ -3489,10 +3489,43 @@ again: * amount plus the amount of bytes that we need for this * reservation. 
*/ - num_bytes = unused - space_info->total_bytes + + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret && !check) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we can flush, be conservative and only let us overcommit + * up to 1/8th of the space, since flushing will reclaim more + * as we go. If we can't flush (we need this reservation right + * now), allow overcommitting up to 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + orig_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..e138af710de2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; @@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2257,6 +2275,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } -- cgit v1.2.3-58-ga151 From e27425d614d68daa08f60735982a7c3a0230e855 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 27 Sep 2011 11:01:30 -0400 Subject: Btrfs: only inherit btrfs specific flags when creating files Xfstests 
79 was failing because we were inheriting the S_APPEND flag when we weren't supposed to. There isn't any specific documentation on this, so I'm taking the test as the standard of how things work; according to it, having S_APPEND set on a directory doesn't mean that S_APPEND gets inherited by its children. So only inherit btrfs specific things. This will let us set compress/nocompress on specific directories, and everything in those directories will inherit the flag, same with nodatacow. With this patch test 79 passes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 24fd75bb0f96..d2b53eb8a8c2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -117,7 +117,7 @@ void btrfs_update_iflags(struct inode *inode) /* * Inherit flags from the parent inode. * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited. */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { @@ -128,12 +128,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) flags = BTRFS_I(dir)->flags; - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - BTRFS_I(inode)->flags = flags; btrfs_update_iflags(inode); } -- cgit v1.2.3-58-ga151 From 73bc187680f94bed498f8a669103cad290e41180 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Oct 2011 14:07:49 -0400 Subject: Btrfs: introduce mount option no_space_cache Some users have requested this, and I've found I needed a way to disable cache loading without actually clearing the cache, so introduce the no_space_cache option. Previously we checked the super block's cache generation field and, if it was populated, always turned space caching on. Now we check this and set the space cache option on, and then parse the mount options so that if we want it off it gets turned off. Then we check the mount option in all the places we do the caching work instead of checking the super's cache generation. This makes things more consistent and lets us turn space caching off. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 ++++----- fs/btrfs/super.c | 21 +++++++++++++++++---- fs/btrfs/transaction.c | 2 +- 3 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 25b69d0f9135..f9711a82fc54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -481,7 +481,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks. 
*/ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -4223,7 +4224,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -7038,13 +7039,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 934789f7fd33..266d1f35465d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_err, }; static match_table_t tokens = { @@ -197,6 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_no_space_cache, "no_space_cache"}, {Opt_err, NULL}, }; @@ -208,14 +209,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; char *compress_type; bool compress_force = false; + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -362,9 +368,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DISCARD); break; case Opt_space_cache: - printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_no_space_cache: + printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_inode_cache: printk(KERN_INFO "btrfs: enabling inode map caching\n"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -393,6 +402,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); kfree(orig); return ret; } @@ -687,6 +698,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",no_space_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c 
index 45655793a2c5..1e1a4816ccb0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1003,7 +1003,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; } -- cgit v1.2.3-58-ga151 From f75b130e9bb361850787e156c79311adb84f551e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 10:00:18 -0400 Subject: Btrfs: don't skip writing out an empty block group's cache I noticed a slight bug where we will not bother writing out the block group cache's space cache if its space tree is empty. Since it could have a cluster or pinned extents that need to be written out, this is just not a valid test. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0a8ccdbdd464..b81556ca75ea 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -575,10 +575,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, INIT_LIST_HEAD(&bitmap_list); - node = rb_first(&ctl->free_space_offset); - if (!node) - return -1; - if (!i_size_read(inode)) return -1; @@ -639,6 +635,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (block_group) start = block_group->key.objectid; + node = rb_first(&ctl->free_space_offset); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; -- cgit v1.2.3-58-ga151 From a67509c30079f4c5025fb19ea443fb2906c3a85e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 15:18:58 -0400 Subject: Btrfs: add an io_ctl struct and helpers for dealing with the space cache In writing and reading the space cache we have one big loop that keeps track of which page we are on, and then a bunch of sizeable loops underneath this big loop to try and read/write out properly. Especially in the write case this makes things hugely complicated and hard to follow, and makes our error checking and recovery equally complex. So add an io_ctl struct with a bunch of helpers to keep track of the pages we have, where we are, if we have enough space, etc. This unifies how we deal with the pages we're writing and keeps all the messy tracking internal. This allows us to kill the big loops in both the read and write case and makes reviewing and changing the write and read paths much simpler. I've run xfstests and stress.sh on this code and it survives. 
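With the helpers in place, the write path collapses to roughly this shape (a condensed sketch of the code in the diff below; error handling and the pinned-extent walk are omitted):

	struct io_ctl io_ctl;

	io_ctl_init(&io_ctl, inode, root);
	io_ctl_prepare_pages(&io_ctl, inode, 0);	/* find and lock pages */
	io_ctl_set_generation(&io_ctl, trans->transid);

	/* stream out entries; page boundaries are handled internally */
	io_ctl_add_entry(&io_ctl, e->offset, e->bytes, e->bitmap);
	io_ctl_add_bitmap(&io_ctl, entry->bitmap);

	io_ctl_zero_remaining_pages(&io_ctl);
	io_ctl_drop_pages(&io_ctl);
	io_ctl_free(&io_ctl);
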
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 693 ++++++++++++++++++++++++-------------------- 1 file changed, 375 insertions(+), 318 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b81556ca75ea..35bfc13c9d42 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -241,27 +241,275 @@ static int readahead_cache(struct inode *inode) return 0; } +struct io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + unsigned long size; + int index; + int num_pages; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root) +{ + memset(io_ctl, 0, sizeof(struct io_ctl)); + io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, + GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + io_ctl->root = root; + return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ + kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ + if (io_ctl->cur) { + kunmap(io_ctl->page); + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ + WARN_ON(io_ctl->cur); + BUG_ON(io_ctl->index >= io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = kmap(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the first 64bits to make sure theres a bogus crc for old + * kernels + */ + io_ctl->cur += sizeof(u64); + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *gen; + + io_ctl_map_page(io_ctl, 0); + + /* Skip the bogus crc area */ + io_ctl->cur += sizeof(u64); + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "btrfs: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = 
io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + /* + * index == 1 means the current page is 0, we need to generate a bogus + * crc for older kernels. + */ + if (io_ctl->index == 1) { + u32 *tmp; + u32 crc = ~(u32)0; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64), + crc, PAGE_CACHE_SIZE - sizeof(u64)); + btrfs_csum_final(crc, (char *)&crc); + crc++; + tmp = io_ctl->orig; + *tmp = crc; + } + io_ctl_unmap_page(io_ctl); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_unmap_page(io_ctl); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_unmap_page(io_ctl); + } +} + +static u8 io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + struct btrfs_free_space_entry *e; + u8 type; + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return type; + + io_ctl_unmap_page(io_ctl); + + if (io_ctl->index >= io_ctl->num_pages) + return type; + + io_ctl_map_page(io_ctl, 0); + return type; +} + +static void io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + BUG_ON(!io_ctl->cur); + if (io_ctl->cur != io_ctl->orig) { + io_ctl_unmap_page(io_ctl); + io_ctl_map_page(io_ctl, 0); + } + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct page *page; + struct io_ctl io_ctl; struct btrfs_key key; + struct btrfs_free_space *e, *n; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - pgoff_t index = 0; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + u8 type; int ret = 0; INIT_LIST_HEAD(&bitmaps); /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) - goto out; + return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -269,11 +517,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return 0; else if (ret > 0) { 
btrfs_release_path(path); - ret = 0; - goto out; + return 0; } ret = -1; @@ -291,170 +538,89 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, " not match free space cache generation (%llu)\n", (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation); - goto out; + return 0; } if (!num_entries) - goto out; + return 0; + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; - while (1) { - struct btrfs_free_space_entry *entry; - struct btrfs_free_space *e; - void *addr; - unsigned long offset = 0; - int need_loop = 0; + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; - if (!num_entries && !num_bitmaps) - break; + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; - page = find_or_create_page(inode->i_mapping, index, mask); - if (!page) + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) goto free_cache; - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); - goto free_cache; - } + type = io_ctl_read_entry(&io_ctl, e); + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; } - addr = kmap(page); - - if (index == 0) { - u64 *gen; - /* - * We put a bogus crc in the front of the first page in - * case old kernels try to mount a fs with the new - * format to make sure they discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); - - gen = addr; - if (*gen != BTRFS_I(inode)->generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache" - " generation (%llu) does not match " - "inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); - kunmap(page); - unlock_page(page); - page_cache_release(page); + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } - addr += sizeof(u64); - offset += sizeof(u64); - } - entry = addr; - - while (1) { - if (!num_entries) - break; - - need_loop = 1; - e = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!e) { - kunmap(page); - unlock_page(page); - page_cache_release(page); + } else { + BUG_ON(!num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); goto free_cache; } - - e->offset = le64_to_cpu(entry->offset); - e->bytes = le64_to_cpu(entry->bytes); - if (!e->bytes) { - kunmap(page); + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); kmem_cache_free(btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); goto free_cache; } - - if (entry->type == BTRFS_FREE_SPACE_EXTENT) { - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - } else { - e->bitmap = kzalloc(PAGE_CACHE_SIZE, 
GFP_NOFS); - if (!e->bitmap) { - kunmap(page); - kmem_cache_free( - btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - list_add_tail(&e->list, &bitmaps); - } - - num_entries--; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - break; - entry++; + list_add_tail(&e->list, &bitmaps); } - /* - * We read an entry out of this page, we need to move on to the - * next page. - */ - if (need_loop) { - kunmap(page); - goto next; - } + num_entries--; + } - /* - * We add the bitmaps at the end of the entries in order that - * the bitmap entries are added to the cache. - */ - e = list_entry(bitmaps.next, struct btrfs_free_space, list); + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); - kunmap(page); - num_bitmaps--; -next: - unlock_page(page); - page_cache_release(page); - index++; + io_ctl_read_bitmap(&io_ctl, e); } + io_ctl_drop_pages(&io_ctl); ret = 1; out: + io_ctl_free(&io_ctl); return ret; free_cache: + io_ctl_drop_pages(&io_ctl); __btrfs_remove_free_space_cache(ctl); goto out; } @@ -554,40 +720,28 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct extent_buffer *leaf; struct rb_node *node; struct list_head *pos, *n; - struct page **pages; - struct page *page; struct extent_state *cached_state = NULL; struct btrfs_free_cluster *cluster = NULL; struct extent_io_tree *unpin = NULL; + struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; u64 start, end, len; - u64 bytes = 0; - u32 crc = ~(u32)0; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; int ret; int err = -1; - bool next_page = false; - bool out_of_space = false; INIT_LIST_HEAD(&bitmap_list); if (!i_size_read(inode)) return -1; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) - return -1; + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -601,30 +755,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, */ unpin = root->fs_info->pinned_extents; - /* - * Lock all pages first so we can lock the extent safely. - * - * NOTE: Because we hold the ref the entire time we're going to write to - * the page find_get_page should never fail, so we don't do a check - * after find_get_page at this point. Just putting this here so people - * know and don't freak out. 
- */ - while (index < num_pages) { - page = find_or_create_page(inode->i_mapping, index, mask); - if (!page) { - int i; - - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - goto out; - } - pages[index] = page; - index++; - } + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); - index = 0; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); @@ -641,166 +774,78 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } - /* Write out the extent entries */ - do { - struct btrfs_free_space_entry *entry; - void *addr, *orig; - unsigned long offset = 0; - - next_page = false; + io_ctl_set_generation(&io_ctl, trans->transid); - if (index >= num_pages) { - out_of_space = true; - break; - } - - page = pages[index]; + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; - orig = addr = kmap(page); - if (index == 0) { - u64 *gen; + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; - /* - * We're going to put in a bogus crc for this page to - * make sure that old kernels who aren't aware of this - * format will be sure to discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); + ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto out_nospc; - gen = addr; - *gen = trans->transid; - addr += sizeof(u64); - offset += sizeof(u64); + if (e->bitmap) { + list_add_tail(&e->list, &bitmap_list); + bitmaps++; } - entry = addr; - - memset(addr, 0, PAGE_CACHE_SIZE - offset); - while (node && !next_page) { - struct btrfs_free_space *e; - - e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; - - entry->offset = cpu_to_le64(e->offset); - entry->bytes = cpu_to_le64(e->bytes); - if (e->bitmap) { - entry->type = BTRFS_FREE_SPACE_BITMAP; - list_add_tail(&e->list, &bitmap_list); - bitmaps++; - } else { - entry->type = BTRFS_FREE_SPACE_EXTENT; - } - node = rb_next(node); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; } + } - /* - * We want to add any pinned extents to our free space cache - * so we don't leak the space - */ - while (block_group && !next_page && - (start < block_group->key.objectid + - block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, - EXTENT_DIRTY); - if (ret) { - ret = 0; - break; - } - - /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + - block_group->key.offset) - break; - - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); - - entries++; - entry->offset = cpu_to_le64(start); - entry->bytes = cpu_to_le64(len); - entry->type = BTRFS_FREE_SPACE_EXTENT; - - start = end + 1; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); 
+ if (ret) { + ret = 0; + break; } - /* Generate bogus crc value */ - if (index == 0) { - u32 *tmp; - crc = btrfs_csum_data(root, orig + sizeof(u64), crc, - PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = orig; - *tmp = crc; - } + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; - kunmap(page); + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); - bytes += PAGE_CACHE_SIZE; + entries++; + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + if (ret) + goto out_nospc; - index++; - } while (node || next_page); + start = end + 1; + } /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) { - void *addr; struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - if (index >= num_pages) { - out_of_space = true; - break; - } - page = pages[index]; - - addr = kmap(page); - memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - + ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + if (ret) + goto out_nospc; list_del_init(&entry->list); - index++; - } - - if (out_of_space) { - btrfs_drop_pages(pages, num_pages); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, - GFP_NOFS); - goto out; } /* Zero out the rest of the pages just to make sure */ - while (index < num_pages) { - void *addr; + io_ctl_zero_remaining_pages(&io_ctl); - page = pages[index]; - addr = kmap(page); - memset(addr, 0, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - index++; - } - - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, - bytes, &cached_state); - btrfs_drop_pages(pages, num_pages); + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + io_ctl_drop_pages(&io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); @@ -817,7 +862,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); goto out; @@ -830,7 +875,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); @@ -848,13 +894,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, err = 0; out: - kfree(pages); + io_ctl_free(&io_ctl); if (err) { - invalidate_inode_pages2_range(inode->i_mapping, 0, index); + invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); return err; + +out_nospc: + list_for_each_safe(pos, n, &bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } + io_ctl_drop_pages(&io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + goto out; } int 
btrfs_write_out_cache(struct btrfs_root *root, -- cgit v1.2.3-58-ga151 From 549b4fdb8f3c0708bbc0ee12ff955cd206c0f60c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 16:33:53 -0400 Subject: Btrfs: check the return value of filemap_write_and_wait in the space cache We need to check the return value of filemap_write_and_wait in the space cache writeout code. Also don't set the inode's generation until we're sure nothing else is going to fail. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 35bfc13c9d42..abc924c9467c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -852,9 +852,10 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out; - BTRFS_I(inode)->generation = trans->transid; - filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -884,6 +885,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; } } + + BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); -- cgit v1.2.3-58-ga151 From 9a82ca659d8bfd99afc0e89bbde2202322df5755 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 16:35:28 -0400 Subject: Btrfs: take overflow into account in reserving space My overcommit stuff can be a little racy when we're filling up the disk with fs_mark and we overcommit into things that quickly get used up for data. So use num_bytes to see if we have enough available space, so we're less likely to overcommit ourselves out of the ability to make reservations. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f9711a82fc54..f95e55083bdb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3521,7 +3521,7 @@ again: avail >>= 1; spin_unlock(&root->fs_info->free_chunk_lock); - if (used + orig_bytes < space_info->total_bytes + avail) { + if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; } -- cgit v1.2.3-58-ga151 From 5b0e95bf607ddd59b39f52d3d55e6581c817b530 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Oct 2011 08:58:24 -0400 Subject: Btrfs: inline checksums into the disk free space cache Yeah yeah I know this is how we used to do it and then I changed it, but damnit I'm changing it back. The fact is that writing out checksums will modify metadata, which could cause us to dirty a block group we've already written out, so we have to truncate it and all of its checksums and re-write it, which will write new checksums, which could dirty a block group that has already been written, and you see where I'm going with this? This can cause unmount, or really anything that depends on a transaction committing, to take its sweet damned time to happen. So go back to the way it was, only this time we're specifically setting NODATACOW because we can't go through the COW pathway anyway and we're doing our own built-in cow'ing by truncating the free space cache. 
The other new thing is once we truncate the old cache and preallocate the new space, we don't need to do that song and dance at all for the rest of the transaction, we can just overwrite the existing space with the new cache if the block group changes for whatever reason, and the NODATACOW will let us do this fine. So keep track of which transaction we last cleared our cache in and if we cleared it in this transaction just say we're all setup and carry on. This survives xfstests and stress.sh. The inode cache will continue to use the normal csum infrastructure since it only gets written once and there will be no more modifications to the fs tree in a transaction commit. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 18 ++-- fs/btrfs/free-space-cache.c | 211 ++++++++++++++++++++++++++++++++------------ fs/btrfs/inode.c | 10 +-- 4 files changed, 172 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1eafccb162ee..ea60897a9171 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -838,6 +838,7 @@ struct btrfs_block_group_cache { u64 bytes_super; u64 flags; u64 sectorsize; + u64 cache_generation; unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f95e55083bdb..0abf70c984e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2717,6 +2717,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2756,19 +2763,16 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) { + if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); - } else { - btrfs_delalloc_release_space(inode, num_pages); - } + btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2776,6 +2780,8 @@ out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abc924c9467c..5d40c1ed8225 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) @@ -99,9 +100,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + if (!((BTRFS_I(inode)->flags & flags) == flags)) { printk(KERN_INFO "Old style space inode found, converting.\n"); - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } @@ -123,12 +125,17 @@ int 
__create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -141,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -249,6 +255,7 @@ struct io_ctl { unsigned long size; int index; int num_pages; + unsigned check_crcs:1; }; static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, @@ -262,6 +269,8 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, if (!io_ctl->pages) return -ENOMEM; io_ctl->root = root; + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + io_ctl->check_crcs = 1; return 0; } @@ -340,25 +349,39 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl_map_page(io_ctl, 1); /* - * Skip the first 64bits to make sure theres a bogus crc for old - * kernels + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. */ - io_ctl->cur += sizeof(u64); + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } val = io_ctl->cur; *val = cpu_to_le64(generation); io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; } static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) { u64 *gen; - io_ctl_map_page(io_ctl, 0); + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. 
+ */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } - /* Skip the bogus crc area */ - io_ctl->cur += sizeof(u64); gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { printk_ratelimited(KERN_ERR "btrfs: space cache generation " @@ -368,7 +391,63 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return -EIO; } io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages;; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + return 0; } @@ -391,22 +470,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; - /* - * index == 1 means the current page is 0, we need to generate a bogus - * crc for older kernels. - */ - if (io_ctl->index == 1) { - u32 *tmp; - u32 crc = ~(u32)0; - - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64), - crc, PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = io_ctl->orig; - *tmp = crc; - } - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); /* No more pages to map */ if (io_ctl->index >= io_ctl->num_pages) @@ -427,14 +491,14 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) * map the next one if there is any left. */ if (io_ctl->cur != io_ctl->orig) { - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index >= io_ctl->num_pages) return -ENOSPC; io_ctl_map_page(io_ctl, 0); } memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); return 0; @@ -442,51 +506,60 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) { - io_ctl_unmap_page(io_ctl); + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. 
+ */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); while (io_ctl->index < io_ctl->num_pages) { io_ctl_map_page(io_ctl, 1); - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); } } -static u8 io_ctl_read_entry(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; - u8 type; e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); entry->bytes = le64_to_cpu(e->bytes); - type = e->type; + *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) - return type; + return 0; io_ctl_unmap_page(io_ctl); if (io_ctl->index >= io_ctl->num_pages) - return type; + return 0; - io_ctl_map_page(io_ctl, 0); - return type; + return io_ctl_check_crc(io_ctl, io_ctl->index); } -static void io_ctl_read_bitmap(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) { - BUG_ON(!io_ctl->cur); - if (io_ctl->cur != io_ctl->orig) { + int ret; + + if (io_ctl->cur && io_ctl->cur != io_ctl->orig) io_ctl_unmap_page(io_ctl); - io_ctl_map_page(io_ctl, 0); - } + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); io_ctl_unmap_page(io_ctl); - if (io_ctl->index < io_ctl->num_pages) - io_ctl_map_page(io_ctl, 0); + + return 0; } int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, @@ -553,6 +626,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out; + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + ret = io_ctl_check_generation(&io_ctl, generation); if (ret) goto free_cache; @@ -563,7 +640,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!e) goto free_cache; - type = io_ctl_read_entry(&io_ctl, e); + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -611,7 +693,9 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, */ list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - io_ctl_read_bitmap(&io_ctl, e); + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; } io_ctl_drop_pages(&io_ctl); @@ -632,7 +716,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; - int ret; + int ret = 0; bool matched; u64 used = btrfs_block_group_used(&block_group->item); @@ -664,6 +748,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, return 0; } + /* We may have converted the inode and made the cache invalid. 
*/ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + goto out; + } + spin_unlock(&block_group->lock); + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, path, block_group->key.objectid); btrfs_free_path(path); @@ -774,6 +866,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { + WARN_ON(1); + goto out_nospc; + } + io_ctl_set_generation(&io_ctl, trans->transid); /* Write out the extent entries */ @@ -864,8 +963,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); goto out; } leaf = path->nodes[0]; @@ -878,9 +977,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); btrfs_release_path(path); goto out; } @@ -942,7 +1040,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15adfb542502..246397d8478f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1792,12 +1792,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } ret = 0; out: - btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (nolock) { - if (trans) + if (root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) { + if (nolock) btrfs_end_transaction_nolock(trans, root); - } else { - if (trans) + else btrfs_end_transaction(trans, root); } -- cgit v1.2.3-58-ga151 From 4b91c14f913f649d4302b3677b85c4ce87a3d8e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 7 Oct 2011 11:55:34 -0400 Subject: Btrfs: wait for ordered extents if we didn't reclaim enough I noticed recently that my overcommit patch was causing one of my enospc tests to fail 25% of the time with early ENOSPC. This is because my overcommit patch was letting us go way overboard, but it wasn't waiting long enough to let the delalloc shrinker do its job. The problem is we just start writeback and wait a little bit hoping we flush enough, but we only free up delalloc space by having the writes complete all the way. We do this by waiting for ordered extents, which we do but only if we already freed enough for the reservation. That isn't right: we should flush ordered extents if we didn't reclaim enough, in case that pushes us over the edge. With this patch I've not seen a failure in this enospc test after running it in a loop for an hour. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0abf70c984e9..fc0de6880045 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3406,7 +3406,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed >= to_reclaim && !trans) + if (reclaimed < to_reclaim && !trans) btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } -- cgit v1.2.3-58-ga151 From e70bea5fe0e3d6355fd95674eaff5aa5a32f0564 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 11 Oct 2011 14:18:24 -0400 Subject: Btrfs: fix the amount of space reserved for unlink Our unlink reservations were a bit much: we were reserving 10 items, while I only count 8 possible items we're touching. So comment what we're reserving for and fix the count value. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 246397d8478f..1f013c5c36aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2758,7 +2758,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); - trans = btrfs_start_transaction(root, 10); + /* + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode ref in the tree log + * 2 for the dir entries in the log + * 1 for the inode + */ + trans = btrfs_start_transaction(root, 8); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; -- cgit v1.2.3-58-ga151 From ed3ee9f44ba55eb6acfbfc8caa881e0253710d2a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Oct 2011 13:09:22 -0400 Subject: Btrfs: fix regression in re-setting a large xattr Recently I changed the xattr stuff to unconditionally set the xattr first in case the xattr didn't exist yet. This has introduced a regression when setting an xattr that already exists with a large value. If we find the key we are looking for, split_leaf will assume that we're extending that item. The problem is that the size we pass down to btrfs_search_slot already includes the size of the item, so if the largest possible xattr is already in place, split_leaf adds our requested size to it and concludes we'd overflow the leaf. Thankfully extending is not what we're actually doing, but split_leaf doesn't know that, so it just returns EOVERFLOW. So in the xattr code we need to check and see if we got back EOVERFLOW and treat it like EEXIST, since that's really what happened. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/xattr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 69565e5fc6a0..a76e41c04b71 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), name, name_len, value, size); + /* + * If we're setting an xattr to a new value but the new value is say + * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting + * back from split_leaf. This is because it thinks we'll be extending + * the existing item size, but we're asking for enough space to add the + * item itself. So if we get EOVERFLOW just set ret to EEXIST and let + * the rest of the function figure it out. 
+ */ + if (ret == -EOVERFLOW) + ret = -EEXIST; + if (ret == -EEXIST) { if (flags & XATTR_CREATE) goto out; -- cgit v1.2.3-58-ga151 From bbb495c2ed9d34894cb3d6d366866fc92a2e1adc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:37:45 -0400 Subject: Btrfs: don't check bytes_pinned to determine if we should commit the transaction Before, the only reason to commit the transaction to recover space in reserve_metadata_bytes() was if there were enough pinned_bytes to satisfy our reservation. But now we have the delayed inode stuff, which will hold its reservations until we commit the transaction. So say we max out our reservation by creating a bunch of files but don't have any pinned bytes: we will ENOSPC out early even though we could commit the transaction and get that space back. So now just unconditionally commit the transaction, since currently there is no way to know how much metadata space is being reserved by delayed inode stuff. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc0de6880045..79365a40cb3a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3568,17 +3568,6 @@ again: goto again; } - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. - */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - ret = -EAGAIN; if (trans) goto out; -- cgit v1.2.3-58-ga151 From f104d044376aadcee74605d66b8d9dc2e145782c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:56:58 -0400 Subject: Btrfs: wait for ordered extents if we're in trouble when shrinking delalloc The only way we actually reclaim delalloc space is waiting for the IO to completely finish. Usually we kick off a bunch of IO and wait for a little bit and hope we can make our reservation, and usually this works out pretty well. With overcommit however we can get seriously underwater if we're filling up the disk quickly, so we need to be able to force the delalloc shrinker to wait for the ordered IO to finish to give us a better chance of actually reclaiming enough space to get our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79365a40cb3a..96cbc5104959 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,7 +3334,8 @@ out: * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) + struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; @@ -3387,11 +3388,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. 
There is no sense in looping here for a long time @@ -3406,8 +3411,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed < to_reclaim && !trans) - btrfs_wait_ordered_extents(root, 0, 0); + return reclaimed >= to_reclaim; } @@ -3438,6 +3442,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root, int ret = 0; bool committed = false; bool flushing = false; + bool wait_ordered = false; trans = (struct btrfs_trans_handle *)current->journal_info; again: @@ -3496,6 +3501,7 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ + wait_ordered = true; num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } @@ -3530,6 +3536,8 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; + } else { + wait_ordered = true; } } @@ -3552,7 +3560,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); + ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3564,6 +3572,7 @@ again: * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } -- cgit v1.2.3-58-ga151 From 877da174301dde9062b915da4c8103048be49702 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:02:10 -0400 Subject: Btrfs: allow shrink_delalloc to flush the needed reclaimed pages Currently we only allow a maximum of 2 megabytes of pages to be flushed at a time. This was ok before, but now we have overcommit which will screw us in a heartbeat if we are quickly filling the disk. So instead pick either 2 megabytes or the number of pages we need to reclaim to be safe again, whichever is larger. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96cbc5104959..424ae82855c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3343,7 +3343,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; @@ -3366,7 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } max_reclaim = min(reserved, to_reclaim); - + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); -- cgit v1.2.3-58-ga151 From b24e03db0df3e9164c9649db12fecc8c2d81b0d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:40:17 -0400 Subject: Btrfs: release trans metadata bytes before flushing delayed refs We started setting trans->block_rsv = NULL to allow the delayed refs flushing stuff to use the right block_rsv and then just made btrfs_trans_release_metadata() unconditionally use the trans block rsv. The problem with this is we need to reserve some space in the transaction and then migrate it to the global block rsv, so we need to be able to free that up properly. So instead just move btrfs_trans_release_metadata() before the delayed ref flushing and use trans->block_rsv for the freeing. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +---- fs/btrfs/transaction.c | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 424ae82855c8..eb4fe56b08bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3909,13 +3909,10 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (!trans->bytes_reserved) return; - block_rsv = &root->fs_info->trans_block_rsv; - btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1e1a4816ccb0..d064fa0a4a07 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -462,6 +462,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; @@ -483,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } - btrfs_trans_release_metadata(trans, root); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1128,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; /* make a pass through all the delayed refs we have so far * and try to process them. */ ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - btrfs_trans_release_metadata(trans, root); - cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to -- cgit v1.2.3-58-ga151 From 3880a1b46d87a6b030c31889875befc745d95dff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:46:51 -0400 Subject: Btrfs: reserve some space for an orphan item when unlinking In __unlink_start_trans() if we don't have enough room for a reservation we will check to see if the unlink will free up space. If it does, that's great, but we could still add an orphan item, so we need to reserve enough space to add the orphan item. Do this and migrate the space to the global reserve so it all works out right. 
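As a rough illustration of the reserve-then-migrate idea, here is a minimal userspace accounting sketch; struct rsv, rsv_migrate() and the sizes are invented for the example and are not the btrfs API:

    #include <stdio.h>

    /* Toy model of two metadata reserves: the short-lived transaction
     * reserve and the long-lived global reserve that ends up backing
     * the orphan item. */
    struct rsv {
            unsigned long reserved;
    };

    /* Move "bytes" worth of reservation from src to dst, failing if
     * src doesn't actually hold that much. */
    static int rsv_migrate(struct rsv *src, struct rsv *dst,
                           unsigned long bytes)
    {
            if (src->reserved < bytes)
                    return -1;
            src->reserved -= bytes;
            dst->reserved += bytes;
            return 0;
    }

    int main(void)
    {
            struct rsv trans_rsv = { 4096 };   /* 1 item's worth, made up */
            struct rsv global_rsv = { 0 };

            if (rsv_migrate(&trans_rsv, &global_rsv, 4096) == 0)
                    printf("global reserve: %lu bytes\n", global_rsv.reserved);
            return 0;
    }

The all-or-nothing failure mode mirrors what the patch relies on from btrfs_block_rsv_migrate(): either the whole reservation moves to the long-lived pool, or the caller finds out and bails.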
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f013c5c36aa..b6b70bdd0992 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2790,7 +2790,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, return ERR_PTR(-ENOMEM); } - trans = btrfs_start_transaction(root, 0); + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); root->fs_info->enospc_unlink = 0; @@ -2895,6 +2896,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = 0; out: btrfs_free_path(path); + /* Migrate the orphan reservation over */ + if (!err) + err = btrfs_block_rsv_migrate(trans->block_rsv, + &root->fs_info->global_block_rsv, + btrfs_calc_trans_metadata_size(root, 1)); + if (err) { btrfs_end_transaction(trans, root); root->fs_info->enospc_unlink = 0; -- cgit v1.2.3-58-ga151 From 36ba022ac0b748dd543f43430b03198e899426c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 12:15:48 -0400 Subject: Btrfs: separate out btrfs_block_rsv_check into 2 different functions Currently btrfs_block_rsv_check does 2 things: it will either refill a block reserve like in the truncate or refill case, or it will check to see if there is enough space in the global reserve and possibly refill it. However because of overcommit we could be well overcommitting ourselves just to try and refill the global reserve, when really we should just be committing the transaction. So break this out into btrfs_block_rsv_refill and btrfs_block_rsv_check. Refill will try to reserve more metadata if it can; btrfs_block_rsv_check will not, it will only tell you whether the given factor of the total space is still reserved. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 +++- fs/btrfs/extent-tree.c | 41 +++++++++++++++++++++++++++-------------- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 6 ++---- fs/btrfs/transaction.c | 4 ++-- 6 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea60897a9171..227620993bce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2252,8 +2252,10 @@ int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush); + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eb4fe56b08bb..a5f1421eeee9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3422,7 +3422,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - wether or not we can flush to make our reservation - * @check - wether this is just to check if we have enough space or not * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. 
If there is not enough space it will make an attempt to @@ -3433,7 +3432,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush, int check) + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; @@ -3507,7 +3506,7 @@ again: (orig_bytes * (retries + 1)); } - if (ret && !check) { + if (ret) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3742,7 +3741,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3752,8 +3751,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, } int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush) + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3762,11 +3760,26 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; if (block_rsv->reserved >= num_bytes) ret = 0; else @@ -3776,7 +3789,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4073,7 +4086,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5728,7 +5741,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -5749,7 +5762,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d40c1ed8225..2fecfc3183ee 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -203,7 +203,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6b70bdd0992..f12747c9447b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3604,7 +3604,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_refill(root, rsv, min_size); /* * Try and steal from the global reserve since we will @@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_refill(root, rsv, min_size); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7fa090fa0d39..10af6a0e0865 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved, - 0, 0); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -3774,8 +3773,7 @@ restart: } } - ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0, - 5, 0); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d064fa0a4a07..29bef63e23ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0, - 5, 0); + + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); return ret ? 1 : 0; } -- cgit v1.2.3-58-ga151 From 7e355b83efa80e5f5821591c13c17649594d82ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 13:07:31 -0400 Subject: Btrfs: if we have a lot of pinned space, commit the transaction Mitch kept hitting a panic because he was getting ENOSPC. One of my previous patches makes it so we are much better at not allocating new metadata chunks. Unfortunately coupled with the overcommit patch this works us into a bit of a problem if we are removing a bunch of space and end up chewing up all of our space with pinned extents. We can allocate chunks fine and overflow is ok, but the only way to reclaim this space is to commit the transaction. So if we go to overcommit, first check and see how much pinned space we have. 
If we have more than 80% of the free space chewed up with pinned extents, just commit the transaction, this will free up enough space for our reservation and we won't have this problem anymore. With this patch Mitch's test doesn't blow up anymore. Thanks, Reported-and-tested-by: Mitch Harder Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5f1421eeee9..4eb7d2ba38f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3510,6 +3510,20 @@ again: u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !trans && + !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + goto commit; + } + spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3581,6 +3595,7 @@ again: if (trans) goto out; +commit: ret = -ENOSPC; if (committed) goto out; -- cgit v1.2.3-58-ga151 From 016fc6a63e465d5b94e4028f6d05d9703e195428 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2011 10:31:37 -0400 Subject: Btrfs: don't flush the cache inode before writing it I noticed we had a little bit of latency when writing out the space cache inodes. It's because we flush it before we write anything in case we have dirty pages already there. This doesn't matter though since we're just going to overwrite the space, and there really shouldn't be any dirty pages anyway. This makes some of my tests run a little bit faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2fecfc3183ee..de205d59b74b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -829,10 +829,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - filemap_write_and_wait(inode->i_mapping); - btrfs_wait_ordered_range(inode, inode->i_size & - ~(root->sectorsize - 1), (u64)-1); - io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ -- cgit v1.2.3-58-ga151 From 42c4dfc213190fafffc53815c2ee6064430bc379 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:28:17 -0400 Subject: cifs: turn read_from_socket into a wrapper around a vectorized version Eventually we'll want to allow cifsd to read data directly into the pagecache. In order to do that we'll need a routine that can take a kvec array and pass that directly to kernel_recvmsg. Unfortunately though, the kernel's recvmsg routines modify the kvec array that gets passed in, so we need to use a copy of the kvec array and refresh that copy on each pass through the loop. 
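To make the clone-and-advance idea concrete, here is a minimal userspace sketch with struct iovec standing in for the kernel's kvec; iov_array_init() is an illustrative analog of the kvec_array_init() helper added below, and it assumes the advance is smaller than the array's total length:

    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Clone iov into new_iov and advance "bytes" into the clone,
     * skipping zero-length segments; returns the number of segments
     * remaining. The original array is left untouched so it can be
     * re-cloned on the next pass through the receive loop. */
    static unsigned int
    iov_array_init(struct iovec *new_iov, const struct iovec *iov,
                   unsigned int nr_segs, size_t bytes)
    {
            size_t base = 0;

            while (bytes || !iov->iov_len) {
                    size_t copy = bytes < iov->iov_len ? bytes : iov->iov_len;

                    bytes -= copy;
                    base += copy;
                    if (iov->iov_len == base) {
                            iov++;
                            nr_segs--;
                            base = 0;
                    }
            }
            memcpy(new_iov, iov, sizeof(*iov) * nr_segs);
            new_iov->iov_base = (char *)new_iov->iov_base + base;
            new_iov->iov_len -= base;
            return nr_segs;
    }

    int main(void)
    {
            char a[4], b[4];
            struct iovec orig[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            struct iovec cur[2];

            /* pretend 6 of the 8 bytes have already been received */
            unsigned int segs = iov_array_init(cur, orig, 2, 6);

            printf("segs=%u first_len=%zu\n", segs, cur[0].iov_len);
            return 0;
    }

Each partial receive leaves the original array pristine; the next pass through the loop rebuilds the clone from it, advanced past whatever has already arrived.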
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 97a65af2a08a..4860940b748b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -375,14 +375,54 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +/* + * kvec_array_init - clone a kvec array, and advance into it + * @new: pointer to memory for cloned array + * @iov: pointer to original array + * @nr_segs: number of members in original array + * @bytes: number of bytes to advance into the cloned array + * + * This function will copy the array provided in iov to a section of memory + * and advance the specified number of bytes into the new array. It returns + * the number of segments in the new array. "new" must be at least as big as + * the original iov array. + */ +static unsigned int +kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, + size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} + static int -read_from_socket(struct TCP_Server_Info *server, char *buf, - unsigned int to_read) +readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) { int length = 0; int total_read; + unsigned int segs; struct msghdr smb_msg; - struct kvec iov; + struct kvec *iov; + + iov = kmalloc(sizeof(*iov_orig) * nr_segs, GFP_NOFS); + if (!iov) + return -ENOMEM; smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; @@ -393,10 +433,11 @@ read_from_socket(struct TCP_Server_Info *server, char *buf, break; } - iov.iov_base = buf + total_read; - iov.iov_len = to_read; - length = kernel_recvmsg(server->ssocket, &smb_msg, &iov, 1, - to_read, 0); + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(server->ssocket, &smb_msg, + iov, segs, to_read, 0); + if (server->tcpStatus == CifsExiting) { total_read = -ESHUTDOWN; break; @@ -423,9 +464,22 @@ read_from_socket(struct TCP_Server_Info *server, char *buf, break; } } + kfree(iov); return total_read; } +static int +read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) +{ + struct kvec iov; + + iov.iov_base = buf; + iov.iov_len = to_read; + + return readv_from_socket(server, &iov, 1, to_read); +} + static bool is_smb_response(struct TCP_Server_Info *server, unsigned char type) { -- cgit v1.2.3-58-ga151 From 1041e3f9919999b22c9c2a453aa0d92cd16b76ee Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:28:27 -0400 Subject: cifs: keep a reusable kvec array for receives Having to continually allocate a new kvec array is expensive. Allocate one that's big enough, and only reallocate it as needed. 
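The caching itself is the classic grow-only scratch buffer pattern; a simplified userspace sketch of the same strategy (struct iov_cache and iov_cache_get() are invented names, not the cifs code):

    #include <stdlib.h>
    #include <sys/uio.h>

    struct iov_cache {
            struct iovec *iov;
            unsigned int nr_iov;
    };

    /* Return a scratch array with room for at least nr_segs entries,
     * growing (never shrinking) the cached allocation as needed. On
     * allocation failure the old array is kept. */
    static struct iovec *
    iov_cache_get(struct iov_cache *c, unsigned int nr_segs)
    {
            struct iovec *new_iov;

            if (c->iov && nr_segs <= c->nr_iov)
                    return c->iov;

            new_iov = malloc(sizeof(*new_iov) * nr_segs);
            if (new_iov) {
                    free(c->iov);
                    c->iov = new_iov;
                    c->nr_iov = nr_segs;
            }
            return new_iov;
    }

    int main(void)
    {
            struct iov_cache c = { 0 };

            return iov_cache_get(&c, 8) ? 0 : 1;
    }

Note that on allocation failure the old, smaller array survives, so a later request that fits within it can still succeed.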
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 55ebf39fb3fd..51ed2de23070 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -292,6 +292,8 @@ struct TCP_Server_Info { bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ struct delayed_work echo; /* echo ping workqueue job */ + struct kvec *iov; /* reusable kvec array for receives */ + unsigned int nr_iov; /* number of kvecs in array */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4860940b748b..ee70075c5fb1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -410,6 +410,24 @@ kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, return nr_segs; } +static struct kvec * +get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) +{ + struct kvec *new_iov; + + if (server->iov && nr_segs <= server->nr_iov) + return server->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); + if (new_iov) { + kfree(server->iov); + server->iov = new_iov; + server->nr_iov = nr_segs; + } + return new_iov; +} + static int readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, unsigned int nr_segs, unsigned int to_read) @@ -420,7 +438,7 @@ readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, struct msghdr smb_msg; struct kvec *iov; - iov = kmalloc(sizeof(*iov_orig) * nr_segs, GFP_NOFS); + iov = get_server_iovec(server, nr_segs); if (!iov) return -ENOMEM; @@ -464,7 +482,6 @@ readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, break; } } - kfree(iov); return total_read; } @@ -669,6 +686,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } kfree(server->hostname); + kfree(server->iov); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); -- cgit v1.2.3-58-ga151 From 89482a56a079f01c2f4c709f8e23fbf7eeda1b43 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:28:57 -0400 Subject: cifs: add a third receive phase to cifs_demultiplex_thread Have the demultiplex thread receive just enough to get to the MID, and then find it before receiving the rest. Later, we'll use this to swap in a preallocated receive buffer for some calls. 
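The shape this gives the receive path is easiest to see for a generic length-prefixed protocol; in this hedged userspace sketch, recv_full(), the 32-byte header size and the ID offset are all made up, and on-the-wire byte order is ignored for brevity:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    /* Loop until exactly len bytes have arrived; 0 on success. */
    static int recv_full(int fd, void *buf, size_t len)
    {
            char *p = buf;

            while (len) {
                    ssize_t n = read(fd, p, len);

                    if (n <= 0)
                            return -1;
                    p += n;
                    len -= (size_t)n;
            }
            return 0;
    }

    enum { HDR_SIZE = 32, ID_OFFSET = 8 };  /* made-up framing */

    int receive_frame(int fd, char *buf, size_t bufsize, uint16_t *id)
    {
            uint32_t len;

            /* phase 1: the length prefix */
            if (recv_full(fd, &len, sizeof(len)))
                    return -1;
            if (len < HDR_SIZE || len > bufsize)
                    return -1;

            /* phase 2: just enough of the header to find the ID */
            if (recv_full(fd, buf, HDR_SIZE))
                    return -1;
            memcpy(id, buf + ID_OFFSET, sizeof(*id));

            /* ...an ID lookup could pick a special buffer here... */

            /* phase 3: the remainder of the frame */
            return recv_full(fd, buf + HDR_SIZE, len - HDR_SIZE);
    }

The point of the middle phase is that by the time the bulk of the frame arrives, the reader already knows which request it belongs to and can choose where to put it.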
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ee70075c5fb1..5308bc6e1248 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -746,11 +746,25 @@ cifs_demultiplex_thread(void *p) if (!is_smb_response(server, buf[0])) continue; - /* check the length */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - 4, pdu_length + 4); + /* make sure we have enough to get to the MID */ + if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { + cERROR(1, "SMB response too short (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; + } + + /* read down to the MID */ + length = read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); + if (length < 0) + continue; + total_read += length; + + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); cifs_reconnect(server); wake_up(&server->response_q); continue; @@ -759,12 +773,15 @@ cifs_demultiplex_thread(void *p) /* else length ok */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { isLargeBuf = true; - memcpy(bigbuf, smallbuf, 4); + memcpy(bigbuf, smallbuf, total_read); smb_buffer = (struct smb_hdr *)bigbuf; buf = bigbuf; } - length = read_from_socket(server, buf + 4, pdu_length); + /* now read the rest */ + length = read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); if (length < 0) continue; total_read += length; -- cgit v1.2.3-58-ga151 From ea1f4502fc939b64807f9ab0eca259321047fe83 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:05 -0400 Subject: cifs: move mid finding into separate routine Begin breaking up find_cifs_mid into smaller pieces. The parts that coalesce T2 responses don't really need to be done under the GlobalMid_lock anyway. Create a new function that just finds the mid on the list, and then later takes it off the list if the entire response has been received. 
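The split is easiest to see as two short critical sections over one pending list; a simplified pthread sketch (struct entry and the list are stand-ins for the cifs mid queue, not its real types):

    #include <pthread.h>
    #include <stddef.h>

    struct entry {
            struct entry *next;
            unsigned int mid;
            int state;
    };

    static pthread_mutex_t mid_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry *pending;

    /* Lookup only: hold the lock just long enough to find the entry. */
    struct entry *find_mid_entry(unsigned int mid)
    {
            struct entry *e;

            pthread_mutex_lock(&mid_lock);
            for (e = pending; e; e = e->next) {
                    if (e->mid == mid)
                            break;
            }
            pthread_mutex_unlock(&mid_lock);
            return e;
    }

    /* Removal is a separate, equally short critical section, entered
     * only once the whole response is in hand. */
    void dequeue_mid_entry(struct entry *mid, int state)
    {
            struct entry **pp;

            pthread_mutex_lock(&mid_lock);
            mid->state = state;
            for (pp = &pending; *pp; pp = &(*pp)->next) {
                    if (*pp == mid) {
                            *pp = mid->next;
                            break;
                    }
            }
            pthread_mutex_unlock(&mid_lock);
    }

Everything between the two critical sections, such as coalescing a multi-part response, can then run without the lock held.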
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 113 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5308bc6e1248..0f69b311d3fc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -542,61 +542,80 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) } static struct mid_q_entry * -find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) +find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) { - struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; + struct mid_q_entry *mid; spin_lock(&GlobalMid_Lock); - list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { - if (mid->mid != buf->Mid || - mid->midState != MID_REQUEST_SUBMITTED || - mid->command != buf->Command) - continue; - - if (*length == 0 && check2ndT2(buf) > 0) { - /* We have a multipart transact2 resp */ - *is_multi_rsp = true; - if (mid->resp_buf) { - /* merge response - fix up 1st*/ - *length = coalesce_t2(buf, mid->resp_buf); - if (*length > 0) { - *length = 0; - mid->multiRsp = true; - break; - } - /* All parts received or packet is malformed. */ - mid->multiEnd = true; - goto multi_t2_fnd; - } - if (!is_large_buf) { - /*FIXME: switch to already allocated largebuf?*/ - cERROR(1, "1st trans2 resp needs bigbuf"); - } else { - /* Have first buffer */ - mid->resp_buf = buf; - mid->largeBuf = true; - *bigbuf = NULL; - } - break; + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if (mid->mid == buf->Mid && + mid->midState == MID_REQUEST_SUBMITTED && + mid->command == buf->Command) { + spin_unlock(&GlobalMid_Lock); + return mid; } - mid->resp_buf = buf; - mid->largeBuf = is_large_buf; -multi_t2_fnd: - if (*length == 0) - mid->midState = MID_RESPONSE_RECEIVED; - else - mid->midState = MID_RESPONSE_MALFORMED; + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} + +static void +dequeue_mid(struct mid_q_entry *mid, int malformed) +{ #ifdef CONFIG_CIFS_STATS2 - mid->when_received = jiffies; + mid->when_received = jiffies; #endif - list_del_init(&mid->qhead); - ret = mid; - break; - } + spin_lock(&GlobalMid_Lock); + if (!malformed) + mid->midState = MID_RESPONSE_RECEIVED; + else + mid->midState = MID_RESPONSE_MALFORMED; + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); +} - return ret; +static struct mid_q_entry * +find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, + int *malformed, bool is_large_buf, bool *is_multi_rsp, + char **bigbuf) +{ + struct mid_q_entry *mid = NULL; + + mid = find_mid(server, buf); + if (!mid) + return mid; + + if (*malformed == 0 && check2ndT2(buf) > 0) { + /* We have a multipart transact2 resp */ + *is_multi_rsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + *malformed = coalesce_t2(buf, mid->resp_buf); + if (*malformed > 0) { + *malformed = 0; + mid->multiRsp = true; + return NULL; + } + /* All parts received or packet is malformed. 
*/ + mid->multiEnd = true; + goto multi_t2_fnd; + } + if (!is_large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cERROR(1, "1st trans2 resp needs bigbuf"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->largeBuf = true; + *bigbuf = NULL; + } + return mid; + } + mid->resp_buf = buf; + mid->largeBuf = is_large_buf; +multi_t2_fnd: + dequeue_mid(mid, *malformed); + return mid; } static void clean_demultiplex_info(struct TCP_Server_Info *server) -- cgit v1.2.3-58-ga151 From ffc00e27aa5d343eb71068c185cdbd65871ccdce Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:13 -0400 Subject: cifs: eliminate is_multi_rsp parm to find_cifs_mid Change find_cifs_mid to only return NULL if a mid could not be found. If we got part of a multi-part T2 response, then coalesce it and still return the mid. The caller can determine the T2 receive status from the flags in the mid. With this change, there is no need to pass a pointer to "length" as well so just pass by value. If a mid is found, then we can just mark it as malformed. If one isn't found, then the value of "length" won't change anyway. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0f69b311d3fc..8d5a61561909 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -576,8 +576,7 @@ dequeue_mid(struct mid_q_entry *mid, int malformed) static struct mid_q_entry * find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int *malformed, bool is_large_buf, bool *is_multi_rsp, - char **bigbuf) + int malformed, bool is_large_buf, char **bigbuf) { struct mid_q_entry *mid = NULL; @@ -585,17 +584,14 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, if (!mid) return mid; - if (*malformed == 0 && check2ndT2(buf) > 0) { - /* We have a multipart transact2 resp */ - *is_multi_rsp = true; + if (malformed == 0 && check2ndT2(buf) > 0) { + mid->multiRsp = true; if (mid->resp_buf) { /* merge response - fix up 1st*/ - *malformed = coalesce_t2(buf, mid->resp_buf); - if (*malformed > 0) { - *malformed = 0; - mid->multiRsp = true; - return NULL; - } + malformed = coalesce_t2(buf, mid->resp_buf); + if (malformed > 0) + return mid; + /* All parts received or packet is malformed. 
*/ mid->multiEnd = true; goto multi_t2_fnd; @@ -614,7 +610,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->resp_buf = buf; mid->largeBuf = is_large_buf; multi_t2_fnd: - dequeue_mid(mid, *malformed); + dequeue_mid(mid, malformed); return mid; } @@ -725,7 +721,6 @@ cifs_demultiplex_thread(void *p) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; bool isLargeBuf = false; - bool isMultiRsp = false; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -745,7 +740,6 @@ cifs_demultiplex_thread(void *p) continue; isLargeBuf = false; - isMultiRsp = false; smb_buffer = (struct smb_hdr *)smallbuf; buf = smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ @@ -823,23 +817,25 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; - mid_entry = find_cifs_mid(server, smb_buffer, &length, - isLargeBuf, &isMultiRsp, &bigbuf); + mid_entry = find_cifs_mid(server, smb_buffer, length, + isLargeBuf, &bigbuf); if (mid_entry != NULL) { - mid_entry->callback(mid_entry); + if (mid_entry->multiRsp && !mid_entry->multiEnd) + continue; + /* Was previous buf put in mpx struct for multi-rsp? */ - if (!isMultiRsp) { + if (!mid_entry->multiRsp) { /* smb buffer will be freed by user thread */ if (isLargeBuf) bigbuf = NULL; else smallbuf = NULL; } + mid_entry->callback(mid_entry); } else if (length != 0) { /* response sanity checks failed */ continue; - } else if (!is_valid_oplock_break(smb_buffer, server) && - !isMultiRsp) { + } else if (!is_valid_oplock_break(smb_buffer, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, -- cgit v1.2.3-58-ga151 From 2a37ef94bb153fad13cbb091aab679d7c8b9a67f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:23 -0400 Subject: cifs: move buffer pointers into TCP_Server_Info We have several functions that need to access these pointers. Currently that's done with a lot of double pointer passing. Instead, move them into the TCP_Server_Info and simplify the handling. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsglob.h | 4 +++ fs/cifs/connect.c | 101 ++++++++++++++++++++++++----------------------------- 2 files changed, 50 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 51ed2de23070..a73dd1d5e7ef 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -291,9 +291,13 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool large_buf; /* is current buffer large? 
*/ struct delayed_work echo; /* echo ping workqueue job */ struct kvec *iov; /* reusable kvec array for receives */ unsigned int nr_iov; /* number of kvecs in array */ + char *smallbuf; /* pointer to current "small" buffer */ + char *bigbuf; /* pointer to current "big" buffer */ + unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8d5a61561909..8918f935bf07 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -320,27 +320,24 @@ requeue_echo: } static bool -allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, - bool is_large_buf) +allocate_buffers(struct TCP_Server_Info *server) { - char *bbuf = *bigbuf, *sbuf = *smallbuf; - - if (bbuf == NULL) { - bbuf = (char *)cifs_buf_get(); - if (!bbuf) { + if (!server->bigbuf) { + server->bigbuf = (char *)cifs_buf_get(); + if (!server->bigbuf) { cERROR(1, "No memory for large SMB response"); msleep(3000); /* retry will check if exiting */ return false; } - } else if (is_large_buf) { + } else if (server->large_buf) { /* we are reusing a dirty large buf, clear its start */ - memset(bbuf, 0, size); + memset(server->bigbuf, 0, sizeof(struct smb_hdr)); } - if (sbuf == NULL) { - sbuf = (char *)cifs_small_buf_get(); - if (!sbuf) { + if (!server->smallbuf) { + server->smallbuf = (char *)cifs_small_buf_get(); + if (!server->smallbuf) { cERROR(1, "No memory for SMB response"); msleep(1000); /* retry will check if exiting */ @@ -349,12 +346,9 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, /* beginning of smb buffer is cleared in our buf_get */ } else { /* if existing small buf clear beginning */ - memset(sbuf, 0, size); + memset(server->smallbuf, 0, sizeof(struct smb_hdr)); } - *bigbuf = bbuf; - *smallbuf = sbuf; - return true; } @@ -576,7 +570,7 @@ dequeue_mid(struct mid_q_entry *mid, int malformed) static struct mid_q_entry * find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int malformed, bool is_large_buf, char **bigbuf) + int malformed) { struct mid_q_entry *mid = NULL; @@ -596,19 +590,27 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->multiEnd = true; goto multi_t2_fnd; } - if (!is_large_buf) { + if (!server->large_buf) { /*FIXME: switch to already allocated largebuf?*/ cERROR(1, "1st trans2 resp needs bigbuf"); } else { /* Have first buffer */ mid->resp_buf = buf; mid->largeBuf = true; - *bigbuf = NULL; + server->bigbuf = NULL; } return mid; } mid->resp_buf = buf; - mid->largeBuf = is_large_buf; + mid->largeBuf = server->large_buf; + /* Was previous buf put in mpx struct for multi-rsp? 
*/ + if (!mid->multiRsp) { + /* smb buffer will be freed by user thread */ + if (server->large_buf) + server->bigbuf = NULL; + else + server->smallbuf = NULL; + } multi_t2_fnd: dequeue_mid(mid, malformed); return mid; @@ -715,12 +717,11 @@ cifs_demultiplex_thread(void *p) { int length; struct TCP_Server_Info *server = p; - unsigned int pdu_length, total_read; - char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; + unsigned int pdu_length; + char *buf = NULL; struct smb_hdr *smb_buffer = NULL; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; - bool isLargeBuf = false; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -735,19 +736,18 @@ cifs_demultiplex_thread(void *p) if (try_to_freeze()) continue; - if (!allocate_buffers(&bigbuf, &smallbuf, - sizeof(struct smb_hdr), isLargeBuf)) + if (!allocate_buffers(server)) continue; - isLargeBuf = false; - smb_buffer = (struct smb_hdr *)smallbuf; - buf = smallbuf; + server->large_buf = false; + smb_buffer = (struct smb_hdr *)server->smallbuf; + buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ length = read_from_socket(server, buf, pdu_length); if (length < 0) continue; - total_read = length; + server->total_read = length; /* * The right amount was read from socket - 4 bytes, @@ -773,7 +773,7 @@ cifs_demultiplex_thread(void *p) sizeof(struct smb_hdr) - 1 - 4); if (length < 0) continue; - total_read += length; + server->total_read += length; if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cERROR(1, "SMB response too long (%u bytes)", @@ -785,10 +785,11 @@ cifs_demultiplex_thread(void *p) /* else length ok */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - isLargeBuf = true; - memcpy(bigbuf, smallbuf, total_read); - smb_buffer = (struct smb_hdr *)bigbuf; - buf = bigbuf; + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, + server->total_read); + smb_buffer = (struct smb_hdr *)server->bigbuf; + buf = server->bigbuf; } /* now read the rest */ @@ -797,9 +798,9 @@ cifs_demultiplex_thread(void *p) pdu_length - sizeof(struct smb_hdr) + 1 + 4); if (length < 0) continue; - total_read += length; + server->total_read += length; - dump_smb(smb_buffer, total_read); + dump_smb(smb_buffer, server->total_read); /* * We know that we received enough to get to the MID as we @@ -810,28 +811,18 @@ cifs_demultiplex_thread(void *p) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); + length = checkSMB(smb_buffer, smb_buffer->Mid, + server->total_read); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, - min_t(unsigned int, total_read, 48)); + min_t(unsigned int, server->total_read, 48)); server->lstrp = jiffies; - mid_entry = find_cifs_mid(server, smb_buffer, length, - isLargeBuf, &bigbuf); + mid_entry = find_cifs_mid(server, smb_buffer, length); if (mid_entry != NULL) { - if (mid_entry->multiRsp && !mid_entry->multiEnd) - continue; - - /* Was previous buf put in mpx struct for multi-rsp? 
*/ - if (!mid_entry->multiRsp) { - /* smb buffer will be freed by user thread */ - if (isLargeBuf) - bigbuf = NULL; - else - smallbuf = NULL; - } - mid_entry->callback(mid_entry); + if (!mid_entry->multiRsp || mid_entry->multiEnd) + mid_entry->callback(mid_entry); } else if (length != 0) { /* response sanity checks failed */ continue; @@ -849,9 +840,9 @@ cifs_demultiplex_thread(void *p) } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ - cifs_buf_release(bigbuf); - if (smallbuf) /* no sense logging a debug message if NULL */ - cifs_small_buf_release(smallbuf); + cifs_buf_release(server->bigbuf); + if (server->smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(server->smallbuf); task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); -- cgit v1.2.3-58-ga151 From c8054ebdb6903208b83aa59c387b16d5129492d5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:31 -0400 Subject: cifs: find mid earlier in receive codepath In order to receive directly into a preallocated buffer, we need to ID the mid earlier, before the bulk of the response is read. Call the mid finding routine as soon as we're able to read the mid. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8918f935bf07..52195bad5e67 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -568,27 +568,21 @@ dequeue_mid(struct mid_q_entry *mid, int malformed) spin_unlock(&GlobalMid_Lock); } -static struct mid_q_entry * -find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int malformed) +static void +handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, + struct smb_hdr *buf, int malformed) { - struct mid_q_entry *mid = NULL; - - mid = find_mid(server, buf); - if (!mid) - return mid; - if (malformed == 0 && check2ndT2(buf) > 0) { mid->multiRsp = true; if (mid->resp_buf) { /* merge response - fix up 1st*/ malformed = coalesce_t2(buf, mid->resp_buf); if (malformed > 0) - return mid; + return; /* All parts received or packet is malformed. 
*/ mid->multiEnd = true; - goto multi_t2_fnd; + return dequeue_mid(mid, malformed); } if (!server->large_buf) { /*FIXME: switch to already allocated largebuf?*/ @@ -599,7 +593,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->largeBuf = true; server->bigbuf = NULL; } - return mid; + return; } mid->resp_buf = buf; mid->largeBuf = server->large_buf; @@ -611,9 +605,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, else server->smallbuf = NULL; } -multi_t2_fnd: dequeue_mid(mid, malformed); - return mid; } static void clean_demultiplex_info(struct TCP_Server_Info *server) @@ -775,6 +767,8 @@ cifs_demultiplex_thread(void *p) continue; server->total_read += length; + mid_entry = find_mid(server, smb_buffer); + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cERROR(1, "SMB response too long (%u bytes)", pdu_length); @@ -819,8 +813,8 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; - mid_entry = find_cifs_mid(server, smb_buffer, length); if (mid_entry != NULL) { + handle_mid(mid_entry, server, smb_buffer, length); if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); } else if (length != 0) { -- cgit v1.2.3-58-ga151 From e9097ab48978c89b9c0926e2ae5d49bf6ea91b18 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:40 -0400 Subject: cifs: break out 3rd receive phase into separate function Move the entire 3rd phase of the receive codepath into a separate function in preparation for the addition of a pluggable receive function. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 101 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 52195bad5e67..f05dedda37c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -704,6 +704,61 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) GFP_KERNEL); } +static int +standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + char *buf = server->smallbuf; + struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; + unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + /* make sure this will fit in a large buffer */ + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -EAGAIN; + } + + /* switch to large buffer if too big for a small one */ + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, server->total_read); + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } + + /* now read the rest */ + length = read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + dump_smb(smb_buffer, server->total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. 
+ */ + length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, server->total_read, 48)); + + if (mid) + handle_mid(mid, server, smb_buffer, length); + + return length; +} + static int cifs_demultiplex_thread(void *p) { @@ -769,57 +824,19 @@ cifs_demultiplex_thread(void *p) mid_entry = find_mid(server, smb_buffer); - if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "SMB response too long (%u bytes)", - pdu_length); - cifs_reconnect(server); - wake_up(&server->response_q); + length = standard_receive3(server, mid_entry); + if (length < 0) continue; - } - /* else length ok */ - if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - server->large_buf = true; - memcpy(server->bigbuf, server->smallbuf, - server->total_read); - smb_buffer = (struct smb_hdr *)server->bigbuf; + if (server->large_buf) { buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; } - /* now read the rest */ - length = read_from_socket(server, - buf + sizeof(struct smb_hdr) - 1, - pdu_length - sizeof(struct smb_hdr) + 1 + 4); - if (length < 0) - continue; - server->total_read += length; - - dump_smb(smb_buffer, server->total_read); - - /* - * We know that we received enough to get to the MID as we - * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. - * - * 48 bytes is enough to display the header and a little bit - * into the payload for debugging purposes. - */ - length = checkSMB(smb_buffer, smb_buffer->Mid, - server->total_read); - if (length != 0) - cifs_dump_mem("Bad SMB: ", buf, - min_t(unsigned int, server->total_read, 48)); - server->lstrp = jiffies; - if (mid_entry != NULL) { - handle_mid(mid_entry, server, smb_buffer, length); if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (length != 0) { - /* response sanity checks failed */ - continue; } else if (!is_valid_oplock_break(smb_buffer, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); -- cgit v1.2.3-58-ga151 From 44d22d846fdc7c3e688fc1ff5ae6d06d08bb5656 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:49 -0400 Subject: cifs: add a callback function to receive the rest of the frame In order to handle larger SMBs for readpages and other calls, we want to be able to read into a preallocated set of buffers. Rather than changing all of the existing code to preallocate buffers however, we instead add a receive callback function to the MID. cifsd will call this function once the mid_q_entry has been identified in order to receive the rest of the SMB. If the mid can't be identified or the receive pointer is unset, then the standard 3rd phase receive function will be called. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsglob.h | 23 ++++++++++++++++++++--- fs/cifs/cifsproto.h | 5 +++-- fs/cifs/cifssmb.c | 5 +++-- fs/cifs/connect.c | 6 +++++- fs/cifs/transport.c | 5 +++-- 5 files changed, 34 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a73dd1d5e7ef..d153d0b89d39 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -656,8 +656,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, struct mid_q_entry; /* - * This is the prototype for the mid callback function. 
When creating one, - * take special care to avoid deadlocks. Things to bear in mind: + * This is the prototype for the mid receive function. This function is for + * receiving the rest of the SMB frame, starting with the WordCount (which is + * just after the MID in struct smb_hdr). Note: + * + * - This will be called by cifsd, with no locks held. + * - The mid will still be on the pending_mid_q. + * - mid->resp_buf will point to the current buffer. + * + * Returns zero on a successful receive, or an error. The receive state in + * the TCP_Server_Info will also be updated. + */ +typedef int (mid_receive_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + +/* + * This is the prototype for the mid callback function. This is called once the + * mid has been received off of the socket. When creating one, take special + * care to avoid deadlocks. Things to bear in mind: * * - it will be called by cifsd, with no locks held * - the mid will be removed from any lists @@ -675,9 +691,10 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif + mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ void *callback_data; /* general purpose pointer for callback */ - struct smb_hdr *resp_buf; /* response buffer */ + struct smb_hdr *resp_buf; /* pointer to received SMB header */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ bool largeBuf:1; /* if valid response, is pointer to large buf */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a1fa9cec05d6..8a7adb31ffed 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, - void *cbdata, bool ignore_pend); + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, + bool ignore_pend); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c824c106b2b7..0613df4d8e74 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -737,7 +737,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov.iov_base = smb; iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, + server, true); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -1834,7 +1835,7 @@ cifs_async_writev(struct cifs_writedata *wdata) kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, - cifs_writev_callback, wdata, false); + NULL, cifs_writev_callback, wdata, false); if (rc == 0) cifs_stats_inc(&tcon->num_writes); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f05dedda37c6..eeee2f5d13ce 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -824,7 +824,11 @@ cifs_demultiplex_thread(void *p) mid_entry = find_mid(server, smb_buffer); - length = standard_receive3(server, mid_entry); + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = 
mid_entry->receive(server, mid_entry); + if (length < 0) continue; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 33a3fbf3a3a5..e7398d0cd054 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -339,8 +339,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) */ int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, void *cbdata, - bool ignore_pend) + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, bool ignore_pend) { int rc; struct mid_q_entry *mid; @@ -374,6 +374,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, goto out_err; } + mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; mid->midState = MID_REQUEST_SUBMITTED; -- cgit v1.2.3-58-ga151 From 2ab2593f4b8953ff951f5531e695e487dfe0b51f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:59 -0400 Subject: cifs: fix protocol definition for READ_RSP There is no pad, and it simplifies the code to remove the "Data" field. None of the existing code relies on these fields, or on the READ_RSP being a particular length. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifspdu.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de3aa285de03..3c6ef34fe2bc 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp { __le16 DataLengthHigh; __u64 Reserved2; __u16 ByteCount; - __u8 Pad; /* BB check for whether padded to DWORD - boundary and optimum performance here */ - char Data[1]; + /* read response data immediately follows */ } __attribute__((packed)) READ_RSP; typedef struct locking_andx_range { -- cgit v1.2.3-58-ga151 From e28bc5b1fdbd6e850488234d6072e6b66fc46146 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:07 -0400 Subject: cifs: add cifs_async_readv ...which will allow cifs to do an asynchronous read call to the server. The caller will allocate and set up cifs_readdata for each READ_AND_X call that should be issued on the wire. The pages passed in are added to the pagecache, but not placed on the LRU list yet (as we need the page->lru to keep the pages on the list in the readdata). When cifsd identifies the mid, it will see that there is a special receive handler for the call, and use that to receive the rest of the frame. cifs_readv_receive will then marshal up a kvec array with kmapped pages from the pagecache, which eliminates one copy of the data. Once the data is received, the pages are added to the LRU list, set uptodate, and unlocked. 
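As a rough caller-side sketch of the new interface (illustration only, not part of the patch: the identifiers come from the diffs in this series, the pages are assumed to be locked and already added to the page cache, and the locking and -EAGAIN retry done by the real caller are elided):

static int issue_async_read(struct cifsFileInfo *open_file,
			    struct address_space *mapping,
			    struct list_head *locked_pages,
			    unsigned int nr_pages, loff_t offset, pid_t pid)
{
	struct cifs_readdata *rdata;
	int rc;

	/* allocates the readdata plus one kvec per page */
	rdata = cifs_readdata_alloc(nr_pages);
	if (rdata == NULL)
		return -ENOMEM;

	cifsFileInfo_get(open_file);	/* put again by cifs_readdata_free */
	rdata->cfile = open_file;
	rdata->mapping = mapping;
	rdata->offset = offset;		/* file offset of the first page */
	rdata->bytes = nr_pages * PAGE_CACHE_SIZE;
	rdata->pid = pid;
	list_splice_init(locked_pages, &rdata->pages);

	/* cifsd calls cifs_readv_receive() for this mid; the completion
	 * work then unlocks the pages and releases rdata */
	rc = cifs_async_readv(rdata);
	if (rc != 0)
		cifs_readdata_free(rdata);	/* caller also cleans up pages */
	return rc;
}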
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsproto.h | 24 ++++ fs/cifs/cifssmb.c | 359 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/connect.c | 26 ++-- 3 files changed, 396 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8a7adb31ffed..c25d0636cc4f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -154,6 +154,12 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); +extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); +extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read); +extern int cifs_readv_from_socket(struct TCP_Server_Info *server, + struct kvec *iov_orig, unsigned int nr_segs, + unsigned int to_read); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -443,6 +449,24 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); +/* asynchronous read support */ +struct cifs_readdata { + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + pid_t pid; + int result; + struct list_head pages; + struct work_struct work; + unsigned int nr_iov; + struct kvec iov[1]; +}; + +struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages); +void cifs_readdata_free(struct cifs_readdata *rdata); +int cifs_async_readv(struct cifs_readdata *rdata); + /* asynchronous write support */ struct cifs_writedata { struct kref refcount; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0613df4d8e74..aaad4ce6e6c5 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -40,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "fscache.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -83,6 +86,9 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ +/* Forward declarations */ +static void cifs_readv_complete(struct work_struct *work); + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ static void mark_open_files_invalid(struct cifs_tcon *pTcon) @@ -1375,6 +1381,359 @@ openRetry: return rc; } +struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages) +{ + struct cifs_readdata *rdata; + + /* readdata + 1 kvec for each page */ + rdata = kzalloc(sizeof(*rdata) + + sizeof(struct kvec) * nr_pages, GFP_KERNEL); + if (rdata != NULL) { + INIT_WORK(&rdata->work, cifs_readv_complete); + INIT_LIST_HEAD(&rdata->pages); + } + return rdata; +} + +void +cifs_readdata_free(struct cifs_readdata *rdata) +{ + cifsFileInfo_put(rdata->cfile); + kfree(rdata); +} + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. 
+ */ +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); + int remaining = rfclen + 4 - server->total_read; + struct cifs_readdata *rdata = mid->callback_data; + + while (remaining > 0) { + int length; + + length = cifs_read_from_socket(server, server->bigbuf, + min_t(unsigned int, remaining, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + dequeue_mid(mid, rdata->result); + return 0; +} + +static int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, remaining, data_len; + struct cifs_readdata *rdata = mid->callback_data; + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; + u64 eof; + pgoff_t eof_index; + struct page *page, *tpage; + + cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, + mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, rfclen, sizeof(*rsp)) - + sizeof(struct smb_hdr) + 1; + + rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; + rdata->iov[0].iov_len = len; + + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + + /* Was the SMB read successful? */ + rdata->result = map_smb_to_linux_error(&rsp->hdr, false); + if (rdata->result != 0) { + cFYI(1, "%s: server returned error %d", __func__, + rdata->result); + return cifs_readv_discard(server, mid); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < sizeof(READ_RSP)) { + cFYI(1, "%s: server returned short header. got=%u expected=%zu", + __func__, server->total_read, sizeof(READ_RSP)); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = le16_to_cpu(rsp->DataOffset) + 4; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cFYI(1, "%s: data offset (%u) inside read response header", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cFYI(1, "%s: data offset (%u) beyond end of smallbuf", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cFYI(1, "%s: total_read=%u data_offset=%u", __func__, + server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + rdata->iov[0].iov_base = server->smallbuf + server->total_read; + rdata->iov[0].iov_len = len; + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* set up first iov for signature check */ + rdata->iov[0].iov_base = server->smallbuf; + rdata->iov[0].iov_len = server->total_read; + cFYI(1, "0: iov_base=%p iov_len=%zu", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + + /* how much data is in the response? 
*/ + data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; + data_len += le16_to_cpu(rsp->DataLength); + if (data_offset + data_len > rfclen) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + /* marshal up the page array */ + len = 0; + remaining = data_len; + rdata->nr_iov = 1; + + /* determine the eof that the server (probably) has */ + eof = CIFS_I(rdata->mapping->host)->server_eof; + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + if (remaining >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + ++rdata->nr_iov; + len += PAGE_CACHE_SIZE; + remaining -= PAGE_CACHE_SIZE; + } else if (remaining > 0) { + /* enough for partial page, fill and zero the rest */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = remaining; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + memset(rdata->iov[rdata->nr_iov].iov_base + remaining, + '\0', PAGE_CACHE_SIZE - remaining); + ++rdata->nr_iov; + len += remaining; + remaining = 0; + } else if (page->index > eof_index) { + /* + * The VFS will not try to do readahead past the + * i_size, but it's possible that we have outstanding + * writes with gaps in the middle and the i_size hasn't + * caught up yet. Populate those with zeroed out pages + * to prevent the VFS from repeatedly attempting to + * fill them until the writes are flushed. 
+ */ + zero_user(page, 0, PAGE_CACHE_SIZE); + list_del(&page->lru); + lru_cache_add_file(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } else { + /* no need to hold page hostage */ + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + } + + /* issue the read if we have any iovecs left to fill */ + if (rdata->nr_iov > 1) { + length = cifs_readv_from_socket(server, &rdata->iov[1], + rdata->nr_iov - 1, len); + if (length < 0) + return length; + server->total_read += length; + } else { + length = 0; + } + + rdata->bytes = length; + + cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, + rfclen, remaining); + + /* discard anything left over */ + if (server->total_read < rfclen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + return length; +} + +static void +cifs_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct page *page, *tpage; + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + kunmap(page); + + if (rdata->result == 0) { + flush_dcache_page(page); + SetPageUptodate(page); + } + + unlock_page(page); + + if (rdata->result == 0) + cifs_readpage_to_fscache(rdata->mapping->host, page); + + page_cache_release(page); + } + cifs_readdata_free(rdata); +} + +static void +cifs_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, + mid->mid, mid->midState, rdata->result, rdata->bytes); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + /* result already set, check signature */ + if (server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (cifs_verify_signature(rdata->iov, rdata->nr_iov, + server, mid->sequence_number + 1)) + cERROR(1, "Unexpected SMB signature"); + } + /* FIXME: should this be counted toward the initiating task? 
*/ + task_io_account_read(rdata->bytes); + cifs_stats_bytes_read(tcon, rdata->bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + break; + default: + rdata->result = -EIO; + } + + queue_work(system_nrt_wq, &rdata->work); + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +/* cifs_async_readv - send an async read, and set up mid to handle result */ +int +cifs_async_readv(struct cifs_readdata *rdata) +{ + int rc; + READ_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + + cFYI(1, "%s: offset=%llu bytes=%u", __func__, + rdata->offset, rdata->bytes); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((rdata->offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); + if (rc) + return rc; + + smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = rdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); + if (wct == 12) + smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); + smb->Remaining = 0; + smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); + smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); + if (wct == 12) + smb->ByteCount = 0; + else { + /* old style read */ + struct smb_com_readx_req *smbr = + (struct smb_com_readx_req *)smb; + smbr->ByteCount = 0; + } + + /* 4 for RFC1001 length + 1 for BCC */ + rdata->iov[0].iov_base = smb; + rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, + cifs_readv_receive, cifs_readv_callback, + rdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_reads); + + cifs_small_buf_release(smb); + return rc; +} + int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *pbuf_type) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eeee2f5d13ce..3d518b9e8c18 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -422,9 +422,9 @@ get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) return new_iov; } -static int -readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, - unsigned int nr_segs, unsigned int to_read) +int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) { int length = 0; int total_read; @@ -479,16 +479,16 @@ readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, return total_read; } -static int -read_from_socket(struct TCP_Server_Info *server, char *buf, - unsigned int to_read) +int +cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) { struct kvec iov; iov.iov_base = buf; iov.iov_len = to_read; - return readv_from_socket(server, &iov, 1, to_read); + return cifs_readv_from_socket(server, &iov, 1, to_read); } static bool @@ -553,8 +553,8 @@ find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) return NULL; } -static void -dequeue_mid(struct mid_q_entry *mid, int malformed) +void +dequeue_mid(struct mid_q_entry *mid, bool malformed) { #ifdef CONFIG_CIFS_STATS2 mid->when_received = jiffies; @@ -730,7 +730,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* now read the rest */ - length =
read_from_socket(server, + length = cifs_read_from_socket(server, buf + sizeof(struct smb_hdr) - 1, pdu_length - sizeof(struct smb_hdr) + 1 + 4); if (length < 0) @@ -791,7 +791,7 @@ cifs_demultiplex_thread(void *p) buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ - length = read_from_socket(server, buf, pdu_length); + length = cifs_read_from_socket(server, buf, pdu_length); if (length < 0) continue; server->total_read = length; @@ -816,8 +816,8 @@ cifs_demultiplex_thread(void *p) } /* read down to the MID */ - length = read_from_socket(server, buf + 4, - sizeof(struct smb_hdr) - 1 - 4); + length = cifs_read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); if (length < 0) continue; server->total_read += length; -- cgit v1.2.3-58-ga151 From 690c5e3163502f229e5b5d455e5212e28c20cd6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:16 -0400 Subject: cifs: convert cifs_readpages to use async reads Now that we have code in place to do asynchronous reads, convert cifs_readpages to use it. The new cifs_readpages walks the page_list that gets passed in, locks and adds the pages to the pagecache and sets up cifs_readdata to handle the reads. The rest is handled by the cifs_async_readv infrastructure. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/file.c | 281 +++++++++++++++++++++++---------------------------------- 1 file changed, 113 insertions(+), 168 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 852d1f39adae..8f6917816fec 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "cifsfs.h" #include "cifspdu.h" @@ -2000,82 +2001,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } - -static void cifs_copy_cache_pages(struct address_space *mapping, - struct list_head *pages, int bytes_read, char *data) -{ - struct page *page; - char *target; - - while (bytes_read > 0) { - if (list_empty(pages)) - break; - - page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); - - if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL)) { - page_cache_release(page); - cFYI(1, "Add page cache failed"); - data += PAGE_CACHE_SIZE; - bytes_read -= PAGE_CACHE_SIZE; - continue; - } - page_cache_release(page); - - target = kmap_atomic(page, KM_USER0); - - if (PAGE_CACHE_SIZE > bytes_read) { - memcpy(target, data, bytes_read); - /* zero the tail end of this partial page */ - memset(target + bytes_read, 0, - PAGE_CACHE_SIZE - bytes_read); - bytes_read = 0; - } else { - memcpy(target, data, PAGE_CACHE_SIZE); - bytes_read -= PAGE_CACHE_SIZE; - } - kunmap_atomic(target, KM_USER0); - - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - data += PAGE_CACHE_SIZE; - - /* add page to FS-Cache */ - cifs_readpage_to_fscache(mapping->host, page); - } - return; -} - static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { - int rc = -EACCES; - int xid; - loff_t offset; - struct page *page; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *pTcon; - unsigned int bytes_read = 0; - unsigned int read_size, i; - char *smb_read_data = NULL; - struct smb_com_read_rsp *pSMBr; - struct cifsFileInfo *open_file; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - __u32 pid; + int rc; + struct list_head tmplist; + struct cifsFileInfo *open_file = file->private_data; + struct 
cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + unsigned int rsize = cifs_sb->rsize; + pid_t pid; - xid = GetXid(); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } - open_file = file->private_data; - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = tlink_tcon(open_file->tlink); + /* + * Give up immediately if rsize is too small to read an entire page. + * The VFS will fall back to readpage. We should never reach this + * point however since we set ra_pages to 0 when the rsize is smaller + * than a cache page. + */ + if (unlikely(rsize < PAGE_CACHE_SIZE)) + return 0; /* * Reads as many pages as possible from fscache. Returns -ENOBUFS @@ -2084,125 +2027,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); if (rc == 0) - goto read_complete; + return rc; - cFYI(DBG2, "rpages: num pages %d", num_pages); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - for (i = 0; i < num_pages; ) { - unsigned contig_pages; - struct page *tmp_page; - unsigned long expected_index; + rc = 0; + INIT_LIST_HEAD(&tmplist); - if (list_empty(page_list)) - break; + cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, + mapping, num_pages); + + /* + * Start with the page at end of list and move it to private + * list. Do the same with any following pages until we hit + * the rsize limit, hit an index discontinuity, or run out of + * pages. Issue the async read and then start the loop again + * until the list is empty. + * + * Note that list order is important. The page_list is in + * the order of declining indexes. When we put the pages in + * the rdata->pages, then we want them in increasing order. + */ + while (!list_empty(page_list)) { + unsigned int bytes = PAGE_CACHE_SIZE; + unsigned int expected_index; + unsigned int nr_pages = 1; + loff_t offset; + struct page *page, *tpage; + struct cifs_readdata *rdata; page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + break; + } + + /* move first page to the tmplist */ offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + list_move_tail(&page->lru, &tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? 
*/ + if (bytes + PAGE_CACHE_SIZE > rsize) + break; - /* count adjacent pages that we will read into */ - contig_pages = 0; - expected_index = - list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(tmp_page, page_list, lru) { - if (tmp_page->index == expected_index) { - contig_pages++; - expected_index++; - } else + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL)) { + __clear_page_locked(page); break; + } + list_move_tail(&page->lru, &tmplist); + bytes += PAGE_CACHE_SIZE; + expected_index++; + nr_pages++; } - if (contig_pages + i > num_pages) - contig_pages = num_pages - i; - - /* for reads over a certain size could initiate async - read ahead */ - - read_size = contig_pages * PAGE_CACHE_SIZE; - /* Read size needs to be in multiples of one page */ - read_size = min_t(const unsigned int, read_size, - cifs_sb->rsize & PAGE_CACHE_MASK); - cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", - read_size, contig_pages); - rc = -EAGAIN; - while (rc == -EAGAIN) { + + rdata = cifs_readdata_alloc(nr_pages); + if (!rdata) { + /* best to give up if we're out of mem */ + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + rc = -ENOMEM; + break; + } + + spin_lock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + rdata->cfile = open_file; + rdata->mapping = mapping; + rdata->offset = offset; + rdata->bytes = bytes; + rdata->pid = pid; + list_splice_init(&tmplist, &rdata->pages); + + do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) - break; + continue; } - io_parms.netfid = open_file->netfid; - io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = offset; - io_parms.length = read_size; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, - &smb_read_data, &buf_type); - /* BB more RC checks ? */ - if (rc == -EAGAIN) { - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - } - } - if ((rc < 0) || (smb_read_data == NULL)) { - cFYI(1, "Read error in readpages: %d", rc); - break; - } else if (bytes_read > 0) { - task_io_account_read(bytes_read); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - cifs_copy_cache_pages(mapping, page_list, bytes_read, - smb_read_data + 4 /* RFC1001 hdr */ + - le16_to_cpu(pSMBr->DataOffset)); - - i += bytes_read >> PAGE_CACHE_SHIFT; - cifs_stats_bytes_read(pTcon, bytes_read); - if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { - i++; /* account for partial page */ - - /* server copy of file can have smaller size - than client */ - /* BB do we need to verify this common case ? - this case is ok - if we are at server EOF - we will hit it on next read */ + rc = cifs_async_readv(rdata); + } while (rc == -EAGAIN); - /* break; */ + if (rc != 0) { + list_for_each_entry_safe(page, tpage, &rdata->pages, + lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); } - } else { - cFYI(1, "No bytes read (%d) at offset %lld . " - "Cleaning remaining pages from readahead list", - bytes_read, offset); - /* BB turn off caching and do new lookup on - file size at server? 
*/ + cifs_readdata_free(rdata); break; } - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - bytes_read = 0; - } - -/* need to free smb_read_data buf before exit */ - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; } -read_complete: - FreeXid(xid); return rc; } -- cgit v1.2.3-58-ga151 From 5eba8ab3606621f7e175ae9f521d71f3ac534f82 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:26 -0400 Subject: cifs: allow for larger rsize= options and change defaults Currently we cap the rsize at a value that fits in CIFSMaxBufSize. That's not needed any longer for readpages. Allow the use of larger values for readpages. cifs_iovec_read and cifs_read however are still limited to the CIFSMaxBufSize. Make sure they don't exceed that. The patch also changes the rsize defaults. The default when unix extensions are enabled is set to 1M for parity with the wsize, and there is a hard cap of ~16M. When unix extensions are not enabled, the default is set to 60k. According to MS-CIFS, Windows servers can only send a max of 60k at a time, so this is more efficient than requesting a larger size. If the user wishes however, the max can be extended up to 128k - the length of the READ_RSP header. Really old servers however require a special hack to ensure that we don't request too large a read. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 120 +++++++++++++++++++++++++++++++++--------------------- fs/cifs/file.c | 14 +++++-- 2 files changed, 84 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3d518b9e8c18..06dfaacfea5d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2310,16 +2310,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) return 0; - if (old->rsize != new->rsize) - return 0; - /* - * We want to share sb only if we don't specify wsize or specified wsize - * is greater or equal than existing one. + * We want to share sb only if we don't specify an r/wsize or + * specified r/wsize is greater than or equal to existing one. 
*/ if (new->wsize && new->wsize < old->wsize) return 0; + if (new->rsize && new->rsize < old->rsize) + return 0; + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) return 0; @@ -2757,14 +2757,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, CIFS_MOUNT_POSIX_PATHS; } - if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { - if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - cifs_sb->rsize = 127 * 1024; - cFYI(DBG2, "larger reads not supported by srv"); - } - } - - cFYI(1, "Negotiate caps 0x%x", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) @@ -2809,27 +2801,11 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - if (pvolume_info->rsize > CIFSMaxBufSize) { - cERROR(1, "rsize %d too large, using MaxBufSize", - pvolume_info->rsize); - cifs_sb->rsize = CIFSMaxBufSize; - } else if ((pvolume_info->rsize) && - (pvolume_info->rsize <= CIFSMaxBufSize)) - cifs_sb->rsize = pvolume_info->rsize; - else /* default */ - cifs_sb->rsize = CIFSMaxBufSize; - - if (cifs_sb->rsize < 2048) { - cifs_sb->rsize = 2048; - /* Windows ME may prefer this */ - cFYI(1, "readsize set to minimum: 2048"); - } - /* - * Temporarily set wsize for matching superblock. If we end up using - * new sb then cifs_negotiate_wsize will later negotiate it downward - * if needed. + * Temporarily set r/wsize for matching superblock. If we end up using + * new sb then client will later negotiate it downward if needed. */ + cifs_sb->rsize = pvolume_info->rsize; cifs_sb->wsize = pvolume_info->wsize; cifs_sb->mnt_uid = pvolume_info->linux_uid; @@ -2904,29 +2880,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, } /* - * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including - * the RFC1001 length. + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. * * Note that this might make for "interesting" allocation problems during * writeback however as we have to allocate an array of pointers for the * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. */ #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) /* - * When the server doesn't allow large posix writes, only allow a wsize of - * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up - * to the maximum size described by RFC1002. + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. */ #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill * a single wsize request with a single call. */ -#define CIFS_DEFAULT_WSIZE (1024 * 1024) +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60k reads. Default to that when posix + * extensions aren't in force. 
+ */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) @@ -2934,7 +2922,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : - CIFS_DEFAULT_WSIZE; + CIFS_DEFAULT_IOSIZE; /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -2957,6 +2945,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) return wsize; } +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * + * If the server advertises a MaxBufferSize of less than one page, + * assume that it also can't satisfy reads larger than that either. + * + * FIXME: Is there a better heuristic for this? + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else if (server->maxBuf >= PAGE_CACHE_SIZE) + defsize = CIFSMaxBufSize; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. 
+ */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + static int is_path_accessible(int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -3234,14 +3266,8 @@ try_mount_again: CIFSSMBQFSAttributeInfo(xid, tcon); } - if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { - cifs_sb->rsize = 1024 * 127; - cFYI(DBG2, "no very large read support, rsize now 127K"); - } - if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, CIFSMaxBufSize); - cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8f6917816fec..a3b545ff5250 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1758,6 +1758,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct smb_com_read_rsp *pSMBr; struct cifs_io_parms io_parms; char *read_data; + unsigned int rsize; __u32 pid; if (!nr_segs) @@ -1770,6 +1771,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); @@ -1782,7 +1786,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cFYI(1, "attempting read on write only file instance"); for (total_read = 0; total_read < len; total_read += bytes_read) { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + cur_len = min_t(const size_t, len - total_read, rsize); rc = -EAGAIN; read_data = NULL; @@ -1874,6 +1878,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, unsigned int bytes_read = 0; unsigned int total_read; unsigned int current_read_size; + unsigned int rsize; struct cifs_sb_info *cifs_sb; struct cifs_tcon *pTcon; int xid; @@ -1886,6 +1891,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + if (file->private_data == NULL) { rc = -EBADF; FreeXid(xid); @@ -1905,8 +1913,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, - cifs_sb->rsize); + current_read_size = min_t(uint, read_size - total_read, rsize); + /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && -- cgit v1.2.3-58-ga151 From 66bfaadc3da74fecb4ba8b03f6b81d5f58b031fa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:35 -0400 Subject: cifs: tune bdi.ra_pages in accordance with the rsize Tune bdi.ra_pages to be a multiple of the rsize. This prevents the VFS from asking for pages that require small reads to satisfy. 
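As a worked example (assuming 4k pages and the kernel's default readahead window of 32 pages): with the 60k non-POSIX default rsize, rsize_pages = 60k / 4k = 15, so the cifs_ra_pages() helper below rounds the window down to (32 / 15) * 15 = 30 pages, and each readahead batch then divides into whole rsize-sized reads with no short trailing request.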
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 06dfaacfea5d..f70d87d6ba61 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3182,6 +3182,22 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +/* make sure ra_pages is a multiple of rsize */ +static inline unsigned int +cifs_ra_pages(struct cifs_sb_info *cifs_sb) +{ + unsigned int reads; + unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + + if (rsize_pages >= default_backing_dev_info.ra_pages) + return default_backing_dev_info.ra_pages; + else if (rsize_pages == 0) + return rsize_pages; + + reads = default_backing_dev_info.ra_pages / rsize_pages; + return reads * rsize_pages; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3200,8 +3216,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) if (rc) return rc; - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3269,6 +3283,9 @@ try_mount_again: cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); + /* tune readahead according to rsize */ + cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb); + remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL /* -- cgit v1.2.3-58-ga151 From fef33df88bef6810cc3c4e6edf55c741a8fd68e3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:37 -0400 Subject: cifs: allow cifs_max_pending to be readable under /sys/module/cifs/parameters Signed-off-by: Jeff Layton --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b0a2e1647390..fa6724562a41 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0); +module_param(cifs_max_pending, int, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); unsigned short echo_retries = 5; -- cgit v1.2.3-58-ga151 From f06ac72e929115f2772c29727152ba0832d641e4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:40 -0400 Subject: cifs, freezer: add wait_event_freezekillable and have cifs use it CIFS currently uses wait_event_killable to put tasks to sleep while they await replies from the server. That function though does not allow the freezer to run. In many cases, the network interface may be going down anyway, in which case the reply will never come. The client then ends up blocking the computer from suspending. Fix this by adding a new wait_event_freezable variant -- wait_event_freezekillable. The idea is to combine the behavior of wait_event_killable and wait_event_freezable -- put the task to sleep and only allow it to be awoken by fatal signals, but also allow the freezer to do its job. 
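As a usage sketch (hypothetical waiter shown; the actual cifs conversion is in the transport.c hunk below), adopting the new macro is a one-line change for any sleeper with the same needs:

-	error = wait_event_killable(waitq, done_flag);
+	error = wait_event_freezekillable(waitq, done_flag);
 	if (error < 0)
 		return -ERESTARTSYS;	/* fatal signal, not the freezer */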
Signed-off-by: Jeff Layton --- fs/cifs/transport.c | 3 ++- include/linux/freezer.h | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e7398d0cd054..0cc9584f5889 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_killable(server->response_q, + error = wait_event_freezekillable(server->response_q, midQ->midState != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 1effc8b56b4e..3672f731f03a 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -134,10 +134,25 @@ static inline void set_freezable_with_signal(void) } /* - * Freezer-friendly wrappers around wait_event_interruptible() and - * wait_event_interruptible_timeout(), originally defined in + * Freezer-friendly wrappers around wait_event_interruptible(), + * wait_event_killable() and wait_event_interruptible_timeout(), originally + * defined in */ +#define wait_event_freezekillable(wq, condition) \ +({ \ + int __retval; \ + do { \ + __retval = wait_event_killable(wq, \ + (condition) || freezing(current)); \ + if (__retval && !freezing(current)) \ + break; \ + else if (!(condition)) \ + __retval = -ERESTARTSYS; \ + } while (try_to_freeze()); \ + __retval; \ +}) + #define wait_event_freezable(wq, condition) \ ({ \ int __retval; \ -- cgit v1.2.3-58-ga151 From d00c5d43866720963a265fa3129f3203cac35b8e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Get rid of nfs_restart_rpc() It can trivially be replaced with rpc_restart_call_prepare. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 10 ---------- fs/nfs/nfs4filelayout.c | 14 ++++---------- fs/nfs/nfs4proc.c | 17 ++++++++--------- fs/nfs/read.c | 2 +- fs/nfs/unlink.c | 4 ++-- fs/nfs/write.c | 2 +- net/sunrpc/clnt.c | 4 +++- 7 files changed, 19 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ab12913dd473..c1a1bd8ddf1c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } -/* - * Helper for restarting RPC calls in the possible presence of NFSv4.1 - * sessions. 
- */ -static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) -{ - if (nfs4_has_session(clp)) - return rpc_restart_call_prepare(task); - return rpc_restart_call(task); -} diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 4c78c62639e6..09119418402f 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -122,7 +122,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_client *clp = data->ds_clp; int reset = 0; dprintk("%s DS read\n", __func__); @@ -134,9 +133,8 @@ static int filelayout_read_done_cb(struct rpc_task *task, if (reset) { pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; } - nfs_restart_rpc(task, clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -203,17 +201,13 @@ static int filelayout_write_done_cb(struct rpc_task *task, if (filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, &reset) == -EAGAIN) { - struct nfs_client *clp; - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; - } else - clp = data->ds_clp; - nfs_restart_rpc(task, clp); + } + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -245,7 +239,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, prepare_to_resend_writes(data); pnfs_set_lo_fail(data->lseg); } else - nfs_restart_rpc(task, data->ds_clp); + rpc_restart_call_prepare(task); return -EAGAIN; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ba0da50865fe..d2ae413c986a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3191,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) struct nfs_server *server = NFS_SERVER(data->inode); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -3241,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3298,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } nfs_refresh_inode(inode, data->res.fattr); @@ -3838,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, data->res.server->nfs_client); + rpc_restart_call_prepare(task); return; } } @@ -4092,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs_restart_rpc(task, - calldata->server->nfs_client); + rpc_restart_call_prepare(task); } } @@ -4926,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) 
task->tk_status = 0; /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: - nfs_restart_rpc(task, data->clp); + rpc_restart_call_prepare(task); return; } dprintk("<-- %s\n", __func__); @@ -5767,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, lrp->clp); + rpc_restart_call_prepare(task); return; } spin_lock(&lo->plh_inode->i_lock); @@ -5938,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) } if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index bfc20b160243..e866a7e6e2d5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -435,7 +435,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); + rpc_restart_call_prepare(task); } /* diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b2fbbde58e44..4f9319a2e567 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + rpc_restart_call_prepare(task); } /** @@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { - nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + rpc_restart_call_prepare(task); return; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2084a6494218..ad90b0c998cb 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1305,7 +1305,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } if (time_before(complain, jiffies)) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c5347d29cfb7..f0268ea7e711 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -850,7 +850,9 @@ rpc_restart_call_prepare(struct rpc_task *task) { if (RPC_ASSASSINATED(task)) return 0; - task->tk_action = rpc_prepare_task; + task->tk_action = call_start; + if (task->tk_ops->rpc_call_prepare != NULL) + task->tk_action = rpc_prepare_task; return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); -- cgit v1.2.3-58-ga151 From fbb5a9abf0d589e9471dc93b18025b7b921d22c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Get rid of unnecessary calls to ClearPageError() in read code The generic file read code does that for us anyway. 
Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 2 -- fs/nfs/write.c | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e866a7e6e2d5..09829d96d207 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -322,7 +322,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - ClearPageError(page); desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; out_bad: @@ -357,7 +356,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head * req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ad90b0c998cb..2219c88d96b2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1011,7 +1011,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); -- cgit v1.2.3-58-ga151 From fba730050d1246d0e6ef44e026e0b584732fec2b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Don't rely on PageError in nfs_readpage_release_partial Don't rely on the PageError flag to tell us if one of the partial reads of the page failed. Instead, replace that with a dedicated flag in the struct nfs_page. Then clean out redundant uses of the PageError flag: the VM no longer checks it for reads. Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 7 ++----- include/linux/nfs_page.h | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 09829d96d207..fd58e909842b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -276,7 +276,6 @@ nfs_async_read_error(struct list_head *head) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - SetPageError(req->wb_page); nfs_readpage_release(req); } } @@ -330,7 +329,6 @@ out_bad: list_del(&data->list); nfs_readdata_free(data); } - SetPageError(page); nfs_readpage_release(req); return -ENOMEM; } @@ -460,10 +458,10 @@ static void nfs_readpage_release_partial(void *calldata) int status = data->task.tk_status; if (status < 0) - SetPageError(page); + set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); if (atomic_dec_and_test(&req->wb_complete)) { - if (!PageError(page)) + if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) SetPageUptodate(page); nfs_readpage_release(req); } @@ -656,7 +654,6 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); - SetPageError(page); out_unlock: unlock_page(page); return error; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index e2791a27a901..ab465fe8c3d6 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -34,6 +34,7 @@ enum { PG_NEED_COMMIT, PG_NEED_RESCHED, PG_PNFS_COMMIT, + PG_PARTIAL_READ_FAILED, }; struct nfs_inode; -- cgit v1.2.3-58-ga151 From b6ee8cd2642f6d822dd1a4ba62298b65ff99b72e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Get rid of the nfs_rdata_mempool We don't need a mempool in order to guarantee reliable NFS read performance. 
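The reasoning can be sketched like this (illustrative fragment, not part of the patch; MIN_RESERVED and the variable names are placeholders): a mempool's reserved objects guarantee forward progress where allocation failure is not an option, which is a writeback property rather than a read property.

        /* Writeout side: must not deadlock on allocation, so objects
         * are reserved up front and mempool_alloc() can always succeed. */
        pool = mempool_create_slab_pool(MIN_RESERVED, cachep);
        obj = mempool_alloc(pool, GFP_NOFS);

        /* Read side: failure is harmless, -ENOMEM just bubbles up. */
        p = kmem_cache_zalloc(cachep, GFP_KERNEL);
        if (!p)
                return -ENOMEM;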
Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index fd58e909842b..8b48ec63f722 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + struct nfs_read_data *p; + p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); if (p) { - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) else { p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) { - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); p = NULL; } } @@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); } void nfs_readdata_release(struct nfs_read_data *rdata) @@ -716,16 +713,10 @@ int __init nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, - nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } -- cgit v1.2.3-58-ga151 From 487505c257021fc06a7d05753cf27b011487f1dc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Oct 2011 21:53:38 +0000 Subject: sysfs: Implement support for tagged files in sysfs. Looking up files in sysfs is hard to understand and analyze because we currently allow placing untagged files in tagged directories. In the implementation of that we have two subtly different meanings of NULL. NULL meaning there is no tag on a directory entry and NULL meaning we don't care which namespace the lookup is performed for. These multiple uses of NULL have resulted in subtle bugs (since fixed) in the code. Currently it is only the bonding driver that needs to have an untagged file in a tagged directory. To untangle this mess I am adding support for tagged files to sysfs. Modifying the bonding driver to implement bonding_masters as a tagged file. Registering bonding_masters once for each network namespace. Then I am removing support for untagged entries in tagged sysfs directories. Resulting in code that is much easier to reason about. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: David S.
Miller --- fs/sysfs/file.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/sysfs.h | 1 + 2 files changed, 52 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ad8c93c1b85..07c1b4ec00df 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -488,17 +488,56 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; +int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + const struct sysfs_ops *ops; + const void *ns = NULL; + int err; + + err = 0; + if (!sysfs_ns_type(dir_sd)) + goto out; + + err = -EINVAL; + if (!kobj->ktype) + goto out; + ops = kobj->ktype->sysfs_ops; + if (!ops) + goto out; + if (!ops->namespace) + goto out; + + err = 0; + ns = ops->namespace(kobj, attr); +out: + if (err) { + WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " + "kobject: %s\n", kobject_name(kobj)); + } + *pns = ns; + return err; +} + int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type, mode_t amode) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; + const void *ns; int rc; + rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); + if (rc) + return rc; + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; + + sd->s_ns = ns; sd->s_attr.attr = (void *)attr; sysfs_dirent_init_lockdep(sd); @@ -586,12 +625,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, { struct sysfs_dirent *sd; struct iattr newattrs; + const void *ns; int rc; + rc = sysfs_attr_ns(kobj, attr, &ns); + if (rc) + return rc; + mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + sd = sysfs_find_dirent(kobj->sd, ns, attr->name); if (!sd) goto out; @@ -616,7 +660,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, NULL, attr->name); + const void *ns; + + if (sysfs_attr_ns(kobj, attr, &ns)) + return; + + sysfs_hash_and_remove(kobj->sd, ns, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d7d2f2158142..dac0859e6440 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -112,6 +112,7 @@ struct bin_attribute { struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *,char *); ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t); + const void *(*namespace)(struct kobject *, const struct attribute *); }; struct sysfs_dirent; -- cgit v1.2.3-58-ga151 From 23396180a9770df2c6a694bbb689c12bdf792f94 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Oct 2011 22:01:34 +0000 Subject: sysfs: Remove support for tagged directories with untagged members. Now that /sys/class/net/bonding_masters is implemented as a tagged sysfs file we can remove support for untagged files in tagged directories. This change removes any ambiguity of what a NULL namespace value means. A NULL namespace parameter after this patch means that we are talking about an untagged sysfs dirent. This makes the sysfs code much less prone to mistakes during maintenance. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: David S.
Miller --- fs/sysfs/dir.c | 6 +++--- fs/sysfs/inode.c | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a830d8..352d26d98c0a 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -543,7 +543,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, struct sysfs_dirent *sd; for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) + if (sd->s_ns != ns) continue; if (!strcmp(sd->s_name, name)) return sd; @@ -885,7 +885,7 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, while (pos && (ino > pos->s_ino)) pos = pos->s_sibling; } - while (pos && pos->s_ns && pos->s_ns != ns) + while (pos && pos->s_ns != ns) pos = pos->s_sibling; return pos; } @@ -896,7 +896,7 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, pos = sysfs_dir_pos(ns, parent_sd, ino, pos); if (pos) pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) + while (pos && pos->s_ns != ns) pos = pos->s_sibling; return pos; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a81c72..527f0cca66ee 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -336,8 +336,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha sysfs_addrm_start(&acxt, dir_sd); sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd && (sd->s_ns != ns)) - sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); -- cgit v1.2.3-58-ga151 From 903e21e2eea036f6947f523f732e28b33a63ed0f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Oct 2011 22:02:43 +0000 Subject: sysfs: Reject with a warning invalid uses of tagged directories. sysfs is a core piece of infrastructure that many people use and few people have all of the rules in their head on how to use it correctly. Add warnings for people using tagged directories improperly so that any misuses can be caught and diagnosed quickly. A single inexpensive test in sysfs_find_dirent is almost sufficient to catch all possible misuses. An additional warning is needed in sysfs_add_dirent so that we actually fail when attempting to add an untagged dirent in a tagged directory. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- fs/sysfs/dir.c | 14 ++++++++++++++ fs/sysfs/file.c | 3 --- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 352d26d98c0a..26f370a9b5ce 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -384,6 +384,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; + if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + acxt->parent_sd->s_name, sd->s_name); + return -EINVAL; + } + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; @@ -542,6 +549,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, { struct sysfs_dirent *sd; + if (!!sysfs_ns_type(parent_sd) != !!ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd)?
"required": "invalid", + parent_sd->s_name, name); + return NULL; + } + for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { if (sd->s_ns != ns) continue; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 07c1b4ec00df..d4e6080b4b20 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - /* Only directories are tagged, so no need to pass - * a tag explicitly. - */ sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) sd = sysfs_find_dirent(sd, NULL, attr); -- cgit v1.2.3-58-ga151 From 71c424bac5679200e272357a225639da8bf94068 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Oct 2011 20:44:48 -0500 Subject: [CIFS] Show nostrictsync and noperm mount options in /proc/mounts Add support to print nostrictsync and noperm mount options in /proc/mounts for shares mounted with these options. (cleanup merge conflict in Sachin's original patch) Suggested-by: Sachin Prabhu Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b0a2e1647390..96a48baad8f7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -436,6 +436,10 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + seq_printf(s, ",nostrictsync"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + seq_printf(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) seq_printf(s, ",strictcache"); -- cgit v1.2.3-58-ga151 From 84850e8d8a5ec7b9d3c47d224e9a10c9da52ff1b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 29 Aug 2011 09:25:53 +0800 Subject: btrfs: check file extent backref offset underflow Offset field in data extent backref can underflow if clone range ioctl is used. We can reliably detect the underflow because max file size is limited to 2^63 and max data extent size is limited by block group size. Signed-off-by: Zheng Yan --- fs/btrfs/relocation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 10af6a0e0865..24d654ce7a06 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3322,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; -- cgit v1.2.3-58-ga151 From 60ccf82f5b6e26e10d41783464ca469c070c7d49 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Thu, 1 Sep 2011 16:33:57 +0200 Subject: btrfs: fix memory leak in btrfs_defrag_file kmemleak found this: unreferenced object 0xffff8801b64af968 (size 512): comm "btrfs-cleaner", pid 3317, jiffies 4306810886 (age 903.272s) hex dump (first 32 bytes): 00 82 01 07 00 ea ff ff c0 83 01 07 00 ea ff ff ................ 80 82 01 07 00 ea ff ff c0 87 01 07 00 ea ff ff ................ backtrace: [] kmemleak_alloc+0x5c/0xc0 [] kmem_cache_alloc_trace+0x163/0x240 [] btrfs_defrag_file+0xf0/0xb20 [] btrfs_run_defrag_inodes+0x165/0x210 [] cleaner_kthread+0x177/0x190 [] kthread+0x8d/0xa0 [] kernel_thread_helper+0x4/0x10 [] 0xffffffffffffffff "pages" is not always freed. Fix it removing the unnecesary additional return. 
Signed-off-by: Diego Calleja --- fs/btrfs/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d2b53eb8a8c2..8ccc106f4e18 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1140,9 +1140,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, btrfs_set_super_incompat_flags(disk_super, features); } - if (!file) - kfree(ra); - return defrag_count; + ret = defrag_count; out_ra: if (!file) -- cgit v1.2.3-58-ga151 From cbcc83265d929ac71553c1b5dafdb830171af947 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:25 +0800 Subject: Btrfs: fix defragmentation regression There's an off-by-one bug: # create a file with lots of 4K file extents # btrfs fi defrag /mnt/file # sync # filefrag -v /mnt/file Filesystem type is: 9123683e File size of /mnt/file is 1228800 (300 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3372 64 1 64 3136 3435 1 2 65 3436 3136 64 3 129 3201 3499 1 4 130 3500 3201 64 5 194 3266 3563 1 6 195 3564 3266 64 7 259 3331 3627 1 8 260 3628 3331 40 eof After this patch: ... # filefrag -v /mnt/file Filesystem type is: 9123683e File size of /mnt/file is 1228800 (300 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3372 300 eof /mnt/file: 1 extent found Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8ccc106f4e18..b39f7bf92704 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1087,7 +1087,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - i += ret; if (newer_than) { if (newer_off == (u64)-1) @@ -1107,7 +1106,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; } } else { - i++; + if (ret > 0) + i += ret; + else + i++; } } -- cgit v1.2.3-58-ga151 From 151a31b25e5c941bdd9fdefed650effca223c716 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:39 +0800 Subject: Btrfs: use i_size_read() in btrfs_defrag_file() Don't use inode->i_size directly, since we're not holding i_mutex. This also fixes another bug, that i_size can change after it's checked against 0 and then (i_size - 1) can be negative. 
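The idiom in isolation (fragment for illustration): snapshot the size once and use only the local copy, so the zero test and the last-index computation cannot see two different values if a truncate races in.

        u64 isize = i_size_read(inode); /* consistent snapshot, even on 32-bit */
        pgoff_t last_index;

        if (isize == 0)
                return 0;
        /* isize is local, so (isize - 1) cannot underflow after the check */
        last_index = (isize - 1) >> PAGE_CACHE_SHIFT;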
Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b39f7bf92704..323d77f09258 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -978,6 +978,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; + u64 isize = i_size_read(inode); u64 features; u64 last_len = 0; u64 skip = 0; @@ -1003,7 +1004,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (inode->i_size == 0) + if (isize == 0) return 0; /* @@ -1028,10 +1029,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, + last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; } if (newer_than) { -- cgit v1.2.3-58-ga151 From 5ca496604b5975d371bb669ee6c2394bcbea818f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:55 +0800 Subject: Btrfs: fix wrong max_to_defrag in btrfs_defrag_file() It's off-by-one, and thus we may skip the last page while defragmenting. An example case: # create /mnt/file with 2 4K file extents # btrfs fi defrag /mnt/file # sync # filefrag /mnt/file /mnt/file: 2 extents found So it's not defragmented. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 323d77f09258..f9026413bcf1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1052,7 +1052,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index - 1; + max_to_defrag = last_index; while (i <= last_index && defrag_count < max_to_defrag) { /* -- cgit v1.2.3-58-ga151 From 83c8c9bde0add721f7509aa446455183b040b931 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Wed, 14 Sep 2011 14:11:21 +0800 Subject: btrfs: trivial fix, a potential memory leak in btrfs_parse_early_options() Signed-off-by: Jie Liu --- fs/btrfs/super.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 266d1f35465d..09ce951666ea 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -419,7 +419,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; + char *device_name, *opts, *orig, *p; int error = 0; int intarg; @@ -470,8 +470,14 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) goto out; break; -- cgit v1.2.3-58-ga151 From 008873eafbc77deb1702aedece33756c58486c6a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:57:07 +0800 Subject: Btrfs: honor extent thresh during defragmentation We won't defrag an extent, if it's bigger than the threshold we specified and 
there's no small extent before it, but actually the code doesn't work this way. There are three bugs: - When should_defrag_range() decides we should keep on defragmenting an extent, last_len is not incremented. (old bug) - The length that passes to should_defrag_range() is not the length we're going to defrag. (new bug) - We always defrag 256K bytes data, and a big extent can be part of this range. (new bug) For a file with 4 extents: | 4K | 4K | 256K | 256K | The result of defrag with (the default) 256K extent thresh should be: | 264K | 256K | but with those bugs, we'll get: | 520K | Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f9026413bcf1..d524b6697ad9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,7 +765,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, int ret = 1; /* - * make sure that once we start defragging and extent, we keep on + * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) @@ -810,7 +810,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. */ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -984,13 +983,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; - int newer_left = 0; unsigned long i; + unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -1020,7 +1020,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * newer_cluster, + pages = kmalloc(sizeof(struct page *) * max_cluster, GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1045,7 +1045,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else goto out_ra; } else { @@ -1077,12 +1076,26 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = max(i + 1, next); continue; } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } - ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) goto out_ra; @@ -1102,15 +1115,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else { break; } } else { - if (ret > 0) + if (ret > 0) { i += ret; - else + last_len += ret << PAGE_CACHE_SHIFT; + } else { i++; + 
last_len = 0; + } } } -- cgit v1.2.3-58-ga151 From f4c697e6406da5dd445eda8d923c53e1138793dd Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 5 Sep 2011 16:34:54 +0200 Subject: btrfs: return EINVAL if start > total_bytes in fitrim ioctl We should return EINVAL if the start is beyond the end of the file system in the btrfs_ioctl_fitrim(). Fix that by adding the appropriate check for it. Also in the btrfs_trim_fs() it is possible that len+start might overflow if big values are passed. Fix it by decrementing the len so that start+len is equal to the file system size in the worst case. Signed-off-by: Lukas Czerner --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d524b6697ad9..136a2f980e21 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,6 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -300,12 +301,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) } } rcu_read_unlock(); + if (!num_devices) return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; + if (range.start > total_bytes) + return -EINVAL; + range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(root, &range); if (ret < 0) -- cgit v1.2.3-58-ga151 From a05a9bb18ae0abec0b513b5fde876c47905fa13e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Sep 2011 16:55:34 +0800 Subject: Btrfs: fix array bound checking Otherwise we can exceed the array bound of path->slots[]. Signed-off-by: Li Zefan --- fs/btrfs/ctree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 011cab3aca8d..0fe615e4ea38 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; -- cgit v1.2.3-58-ga151 From 560f7d75457f86a43970aa413e334e394082dce4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Sep 2011 10:22:01 +0800 Subject: Btrfs: remove BUG_ON() in compress_file_range() It's not a big deal if we fail to allocate the array, and instead of panicking we can just give up on compressing.
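The general shape of such fixes (sketch): BUG_ON() is for states that must be impossible, while a failed optional allocation is an expected state with a graceful answer.

        /* Before: a transient -ENOMEM becomes a kernel panic. */
        BUG_ON(!pages);

        /* After: treat the failure as "skip the optimisation". */
        if (!pages)
                goto cont;      /* carry on down the uncompressed path */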
Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f12747c9447b..81d4f68f35c9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,7 +393,10 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - BUG_ON(!pages); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +427,7 @@ again: will_compress = 1; } } +cont: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); -- cgit v1.2.3-58-ga151 From f0dd9592a1aa014b3a01aa2be7e795aae040d65b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Sep 2011 10:26:51 +0800 Subject: Btrfs: fix direct-io vs nodatacow To reproduce the bug: # mount -o nodatacow /dev/sda7 /mnt/ # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB) copied, 0.000136115 s, 30.1 MB/s # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 conv=notrunc oflag=direct dd: writing `/mnt/tmp': Input/output error 1+0 records in 0+0 records out btrfs_ordered_update_i_size() may return 1, but btrfs_endio_direct_write() mistakenly takes it as an error. Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81d4f68f35c9..65474d95f26f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5777,8 +5777,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - ret = btrfs_update_inode(trans, root, inode); - err = ret; + err = btrfs_update_inode(trans, root, inode); goto out; } -- cgit v1.2.3-58-ga151 From fee187d9d9ddc382c81370a9a280391132dea2e1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Sep 2011 15:55:28 +0800 Subject: Btrfs: do not set EXTENT_DIRTY along with EXTENT_DELALLOC Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0ada0b700b44..f284d4e5f447 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1107,7 +1107,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } -- cgit v1.2.3-58-ga151 From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 2 Oct 2011 13:56:53 +0300 Subject: Btrfs: pass the correct root to lookup_free_space_inode() Free space items are located in the tree of tree roots, not in the extent tree. It didn't pop up because lookup_free_space_inode() grabs the inode all the time instead of actually searching the tree.
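Why it matters, sketched (the key layout shown is schematic): a btrfs lookup is addressed entirely by the root handle passed in, so the same key searched under the extent root would land in the wrong tree.

        struct btrfs_key key;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;       /* cache inode item */
        key.type = 0;
        key.offset = block_group->key.objectid;

        /* the tree of tree roots, not the extent root, holds these items */
        ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);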
Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4eb7d2ba38f8..6cfcc9060c83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); -- cgit v1.2.3-58-ga151 From cfbffc39ac89dbd5197cbeec2599a1128eb928f8 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 6 Oct 2011 13:37:08 +0900 Subject: Btrfs: fix return value of btrfs_get_acl() In btrfs_get_acl(), when the second __btrfs_getxattr() call fails, acl is not correctly set. Therefore, a wrong value might return to the caller. Signed-off-by: Tsutomu Itoh --- fs/btrfs/acl.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index eb159aaa5a11..89b156d85d63 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) { - kfree(value); - return acl; - } - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } -- cgit v1.2.3-58-ga151 From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 9 Sep 2011 17:34:35 +0800 Subject: Btrfs: fix race between multi-task space allocation and caching space The task may fail to get free space though it is enough when multi-task space allocation and caching space happen at the same time. Task1 Caching Thread Task2 ------------------------------------------------------------------------ find_free_extent The space has not be cached, and start caching thread. And wait for it. cache space, if the space is > 2MB wake up Task1 find_free_extent get all the space that is cached. try to allocate space, but there is no space now. trigger BUG_ON() The message is following: btrfs allocation failed flags 1, wanted 4096 space_info has 1040187392 free, is not full space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0 block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:835! [] __extent_writepage+0x1bf/0x5ce [btrfs] [] ? __set_page_dirty_nobuffers+0xfe/0x108 [] ? wait_current_trans+0x23/0xec [btrfs] [] ? find_get_pages_tag+0x73/0xe2 [] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs] [] extent_writepages+0x3e/0x53 [btrfs] [] ? do_sync_write+0xc6/0x103 [] ? btrfs_submit_direct+0x414/0x414 [btrfs] [] ? 
fsnotify+0x236/0x266 [] btrfs_writepages+0x22/0x24 [btrfs] [] do_writepages+0x1c/0x25 [] __filemap_fdatawrite_range+0x4e/0x50 [] filemap_write_and_wait_range+0x28/0x51 [] btrfs_sync_file+0x7d/0x198 [btrfs] [] ? fsnotify_modify+0x5d/0x65 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b [SNIP] RIP [] cow_file_range+0x1c4/0x32b [btrfs] We fix this bug by trying to allocate the space again if there are block groups in caching. Signed-off-by: Miao Xie --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6cfcc9060c83..cef355f1328a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -5036,6 +5037,7 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -5244,6 +5246,8 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: @@ -5294,6 +5298,9 @@ loop: } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; -- cgit v1.2.3-58-ga151 From 068132bad1de70f85f5f6d12c36d64f8f7848d92 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Thu, 23 Jun 2011 23:01:01 +0800 Subject: btrfs: fix oops on failure path If lookup_extent_backref fails, path->nodes[0] reasonably could be null along with other callers of btrfs_print_leaf, so ensure we have a valid extent buffer before dereferencing. Signed-off-by: Daniel J Blueman --- fs/btrfs/print-tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e9..f38e452486b8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; + if (!l) + return; + + nr = btrfs_header_nritems(l); + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); -- cgit v1.2.3-58-ga151 From 5f524444c351e145a5f7e28253594688a421bfe8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Oct 2011 00:20:43 +0300 Subject: Btrfs: fix a bug when opening seed devices Initialize fs_info->bdev_holder a bit earlier to be able to pass a correct holder id to blkdev_get() when opening seed devices with O_EXCL. 
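For context, the exclusive-open contract involved looks roughly like this (2011-era block layer API; path and holder are illustrative): every FMODE_EXCL open belonging to one filesystem instance must pass the same holder cookie, which is why bdev_holder has to be valid before any seed device is opened.

        struct block_device *bdev;

        bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        /* ... the matching release must pass the same mode ... */
        blkdev_put(bdev, FMODE_READ | FMODE_EXCL);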
Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 09ce951666ea..29eecbb6ec3a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -939,6 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { @@ -946,7 +947,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } -- cgit v1.2.3-58-ga151 From 20bcd64934e4eb8f3f90a0dca54fb0ac2edd7795 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 20 Oct 2011 00:06:20 +0300 Subject: Btrfs: close all bdevs on mount failure Fix a bug introduced by 20b45077. We have to return EINVAL on mount failure, but doing that too early in the sequence leaves all of the devices opened exclusively. This also fixes an issue where under some scenarios only a second mount -o degraded command would succeed. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e138af710de2..c6938b45e0fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -597,10 +597,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; + if (!bh) goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -655,7 +653,7 @@ error: continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; -- cgit v1.2.3-58-ga151 From 09e0834fb0ce1ea2a63885177015bd5d7d2bc22d Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Thu, 20 Oct 2011 18:56:10 -0400 Subject: ext4: fix deadlock in ext4_ordered_write_end() If ext4_jbd2_file_inode() in ext4_ordered_write_end() fails for some reasons, this function returns to caller without unlocking the page. It leads to the deadlock, and the patch fixes this issue. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 081bb25a9ad3..ecb572591924 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -959,7 +959,11 @@ static int ext4_ordered_write_end(struct file *file, ext4_orphan_add(handle, inode); if (ret2 < 0) ret = ret2; + } else { + unlock_page(page); + page_cache_release(page); } + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; -- cgit v1.2.3-58-ga151 From 8de49e674a1133ab8998914a6e933ceb4b5f4b07 Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Thu, 20 Oct 2011 19:23:08 -0400 Subject: ext4: fix the deadlock in mpage_da_map_and_submit() If ext4_jbd2_file_inode() in mpage_da_map_and_submit() fails due to journal abort, this function returns to caller without unlocking the page. It leads to the deadlock, and the patch fixes this issue by calling mpage_da_submit_io(). 
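The contract being repaired, in schematic form (not the ext4 code): ->write_end() receives the page locked and referenced and must drop both on every path, success and failure alike, or the next task to lock that page waits forever.

        static int example_write_end(struct page *page)
        {
                int ret = commit_the_data(page);        /* placeholder fs work */

                unlock_page(page);              /* never return with it held */
                page_cache_release(page);       /* drop the page reference */
                return ret;
        }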
Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ecb572591924..ff6aace0bb3c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1552,9 +1552,11 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) + if (err) { /* Only if the journal is aborted */ - return; + mpd->retval = err; + goto submit_io; + } } } @@ -2294,11 +2296,12 @@ retry: ret = 0; } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* - * got one extent now try with - * rest of the pages + * Got one extent now try with rest of the pages. + * If mpd.retval is set -EIO, journal is aborted. + * So we don't need to write any more. */ pages_written += mpd.pages_written; - ret = 0; + ret = mpd.retval; io_done = 1; } else if (wbc->nr_to_write) /* -- cgit v1.2.3-58-ga151 From 940aab490215424a269f93d2eba2794fc8b3e269 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Tue, 20 Sep 2011 17:27:14 -0700 Subject: Check validity of cl_rpcclient in nfs_server_list_show As soon as the nfs_client gets created, its cl_rpcclient is set to ERR_PTR(-EINVAL). The rpc client structure is allocated later. Check if the client is ready before using the cl_rpcclient pointer. Signed-off-by: Malahal Naineni Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b4e41dd4d0f6..873bf00d51a2 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1868,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v) /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), -- cgit v1.2.3-58-ga151 From 45dc63e7d8412546567399b94caeb683af58843e Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 20 Oct 2011 20:07:23 -0400 Subject: ext4: Allow quota file use root reservation The quota file is filesystem metadata, so it is reasonable to permit it to use the root reservation if necessary. This patch fixes the 265th xfstest failure. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8c005c028203..cd70b3041185 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4291,6 +4291,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + /* * For delayed allocation, we could skip the ENOSPC and * EDQUOT check, as blocks and quotas have been already -- cgit v1.2.3-58-ga151 From 4c28d33803d4aeaff32b4ac502af11a9b2aed8f4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 Jul 2011 09:17:28 +0100 Subject: GFS2: Clean up dir hash table reading Since there is now only a single caller to gfs2_dir_read_data() and it has a number of constant arguments, we can factor those out. Also some tests relating to the inode size were being done twice.
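One detail of the reader worth spelling out (fragment; after this patch the starting offset is always zero): do_div() divides a 64-bit value in place and returns the remainder, so one call splits a byte offset into a block number and an in-block offset.

        u64 lblock = offset;    /* input: byte offset, output: block number */
        unsigned int o;

        o = do_div(lblock, sdp->sd_jbsize);     /* o = offset % sd_jbsize */
        o += sizeof(struct gfs2_meta_header);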
Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1cc2f8ec52a2..2045d70753f1 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -240,16 +240,15 @@ fail: return error; } -static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, - u64 offset, unsigned int size) +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct buffer_head *dibh; int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - offset += sizeof(struct gfs2_dinode); - memcpy(buf, dibh->b_data + offset, size); + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); } @@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, * gfs2_dir_read_data - Read a data from a directory inode * @ip: The GFS2 Inode * @buf: The buffer to place result into - * @offset: File offset to begin jdata_readng from * @size: Amount of data to transfer * * Returns: The amount of data actually copied or the error */ -static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, - unsigned int size, unsigned ra) +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 lblock, dblock; @@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, unsigned int o; int copied = 0; int error = 0; - u64 disksize = i_size_read(&ip->i_inode); - - if (offset >= disksize) - return 0; - - if (offset + size > disksize) - size = disksize - offset; - - if (!size) - return 0; if (gfs2_is_stuffed(ip)) - return gfs2_dir_read_stuffed(ip, buf, offset, size); + return gfs2_dir_read_stuffed(ip, buf, size); if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) return -EINVAL; - lblock = offset; + lblock = 0; o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); while (copied < size) { @@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, if (error || !dblock) goto fail; BUG_ON(extlen < 1); - if (!ra) - extlen = 1; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); @@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, extlen--; memcpy(buf, bh->b_data + o, amount); brelse(bh); - buf += amount; + buf += (amount/sizeof(__be64)); copied += amount; lblock++; o = sizeof(struct gfs2_meta_header); @@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc == NULL) return ERR_PTR(-ENOMEM); - ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); + ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { kfree(hc); return ERR_PTR(ret); -- cgit v1.2.3-58-ga151 From 2f0264d592e34cde99efbad7d616b04af138e913 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 27 Jul 2011 10:58:48 +0100 Subject: GFS2: Split data write & wait in fsync Now that the data writing is part of fsync proper, we can split the waiting part out and do it later on. This reduces the number of waits that we do during fsync on average. There is also no need to take the i_mutex unless we are flushing metadata to disk, so we can move that to within the metadata flushing code. 
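Reduced to its essentials, the new fast path looks like this (sketch; sync_the_metadata() stands in for the filesystem-specific part): start the data I/O, do the metadata work while it is in flight, then wait for the data.

        int ret = 0, ret1 = 0;

        if (mapping->nrpages)
                ret1 = filemap_fdatawrite_range(mapping, start, end);

        if (ret1 != -EIO) {
                sync_the_metadata(inode);       /* overlaps the data I/O */
                if (mapping->nrpages)
                        ret = filemap_fdatawait_range(mapping, start, end);
        }
        return ret ? ret : ret1;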
Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e802903..92c3db424b40 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -551,8 +551,16 @@ static int gfs2_close(struct inode *inode, struct file *file) * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * - * The VFS will flush data for us. We only need to worry - * about metadata here. + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. * * Returns: errno */ @@ -560,30 +568,36 @@ static int gfs2_close(struct inode *inode, struct file *file) static int gfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); struct gfs2_inode *ip = GFS2_I(inode); - int ret; + int ret, ret1 = 0; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { + mutex_lock(&inode->i_mutex); ret = sync_inode_metadata(inode, 1); if (ret) { mutex_unlock(&inode->i_mutex); return ret; } gfs2_ail_flush(ip->i_gl); + mutex_unlock(&inode->i_mutex); } - mutex_unlock(&inode->i_mutex); - return 0; + if (mapping->nrpages) + ret = filemap_fdatawait_range(mapping, start, end); + + return ret ? ret : ret1; } /** -- cgit v1.2.3-58-ga151 From 75549186edf1515062fe2fcbfbd92bd99659afba Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 2 Aug 2011 13:09:36 +0100 Subject: GFS2: Fix bug-trap in ail flush code The assert was being tested under the wrong lock, a legacy of the original code. Also, if it does trigger, the resulting information was not always a lot of help. This moves the test under the correct lock and also prints out more useful information to help in tracking down the source of the problem.
Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index da21ecaafcc2..99df4832f94c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,17 @@ #include "trans.h" #include "dir.h" +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_page->mapping, bh->b_page->flags); + fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); +} + /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock @@ -41,20 +52,24 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) struct list_head *head = &gl->gl_ail_list; struct gfs2_bufdata *bd; struct buffer_head *bh; + sector_t blocknr; spin_lock(&sdp->sd_ail_lock); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_ail_gl_list); bh = bd->bd_bh; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; + blocknr = bh->b_blocknr; + if (buffer_busy(bh)) + gfs2_ail_error(gl, bh); bh->b_private = NULL; + gfs2_remove_from_ail(bd); /* drops ref on bh */ spin_unlock(&sdp->sd_ail_lock); - bd->bd_blkno = bh->b_blocknr; + bd->bd_bh = NULL; + bd->bd_blkno = blocknr; + gfs2_log_lock(sdp); - gfs2_assert_withdraw(sdp, !buffer_busy(bh)); gfs2_trans_add_revoke(sdp, bd); gfs2_log_unlock(sdp); -- cgit v1.2.3-58-ga151 From 1d4ec642d9f00d4c531b1a4ae0613091ec1f8e9b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 2 Aug 2011 13:13:20 +0100 Subject: GFS2: Make atime checks more efficient We do not need to start a transaction unless the atime check has proved positive. Also if we are going to flush the complete ail list anyway, we might as well skip the writeback for this specific inode's metadata, since that will be done as part of the ail writeback process in an order offering potentially more efficient I/O. 
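Reduced to the pattern (sketch): do the cheap comparison first, and pay for a transaction only when the timestamp has actually moved forward.

        if (timespec_compare(&inode->i_atime, &atime) > 0) {
                ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
                if (ret == 0) {
                        /* journal the buffer and write out the new atime */
                        gfs2_trans_end(sdp);
                }
        }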
Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b7beadd9ba4c..afb87615c014 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -768,30 +768,30 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) goto do_flush; unlock_required = 1; } - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { di = (struct gfs2_dinode *)bh->b_data; atime.tv_sec = be64_to_cpu(di->di_atime); atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret == 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + gfs2_trans_end(sdp); + } } brelse(bh); } - gfs2_trans_end(sdp); -do_unlock: if (unlock_required) gfs2_glock_dq_uninit(&gh); do_flush: if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - filemap_fdatawrite(metamapping); if (bdi->dirty_exceeded) gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) ret = filemap_fdatawait(metamapping); if (ret) -- cgit v1.2.3-58-ga151 From 40ac218f52aa5cac7dc8082f28b61c8b2b29373c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 2 Aug 2011 13:17:27 +0100 Subject: GFS2: Fix inode allocation error path If we have got far enough through the inode allocation code path that an inode has already been allocated, then we must call iput to dispose of it, if an error occurs during a later part of the process. This will always be the final iput since there will be no other references to the inode. Unlike when the inode has been unlinked, its block state will be GFS2_BLKST_INODE rather than GFS2_BLKST_UNLINKED so we need to skip the test in ->evict_inode() for this one case in order to ensure that it will be deallocated correctly. This patch adds a new flag in order to ensure that this will happen correctly. 
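The mechanism, sketched from both ends (fragments from the two paths involved): the failing allocation path tags the inode before dropping the last reference, and eviction treats the tag as permission to deallocate even though the block state is still GFS2_BLKST_INODE.

        /* allocation error path: */
        set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
        iput(inode);    /* final ref, so ->evict_inode() runs next */

        /* eviction path: skip the UNLINKED check for tagged inodes */
        if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))
                error = gfs2_check_blk_type(sdp, ip->i_no_addr,
                                            GFS2_BLKST_UNLINKED);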
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/inode.c | 6 ++++-- fs/gfs2/super.c | 12 +++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 892ac37de8ae..be5801a75406 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -267,6 +267,7 @@ struct gfs2_alloc { enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 900cf986aadc..044efe273b97 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -736,10 +736,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode && !IS_ERR(inode)) - iput(inode); fail_gunlock: gfs2_glock_dq_uninit(ghs); + if (inode && !IS_ERR(inode)) { + set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); + iput(inode); + } fail: if (bh) brelse(bh); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index afb87615c014..9961de702d1b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1471,9 +1471,11 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; + if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + } if (test_bit(GIF_INVALID, &ip->i_flags)) { error = gfs2_inode_refresh(ip); @@ -1513,6 +1515,10 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + gfs2_log_flush(sdp, ip->i_gl); + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl); + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) -- cgit v1.2.3-58-ga151 From f18185291d605ea9e442e00e2cf6c917a84d9837 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 5 Aug 2011 10:12:47 +0100 Subject: GFS2: Fix bug trap and journaled data fsync Journaled data requires that a complete flush of all dirty data for the file is done, in order that the ail flush which comes after will succeed. Also the recently enhanced bug trap can trigger falsely in case an ail flush from fsync races with a page read. This updates the bug trap such that it will ignore buffers which are locked and only trigger on dirty and/or pinned buffers when the ail flush is run from fsync. The original bug trap is retained when ail flush is run from ->go_sync() Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 ++ fs/gfs2/glops.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 92c3db424b40..9d12286d8111 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -590,6 +590,8 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, mutex_unlock(&inode->i_mutex); return ret; } + if (gfs2_is_jdata(ip)) + filemap_write_and_wait(mapping); gfs2_ail_flush(ip->i_gl); mutex_unlock(&inode->i_mutex); } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 99df4832f94c..0cc3ff48ce20 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -46,7 +46,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * None of the buffers should be dirty, locked, or pinned. 
*/ -static void __gfs2_ail_flush(struct gfs2_glock *gl) +static void __gfs2_ail_flush(struct gfs2_glock *gl, unsigned long b_state) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; @@ -60,7 +60,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) bd_ail_gl_list); bh = bd->bd_bh; blocknr = bh->b_blocknr; - if (buffer_busy(bh)) + if (bh->b_state & b_state) gfs2_ail_error(gl, bh); bh->b_private = NULL; gfs2_remove_from_ail(bd); /* drops ref on bh */ @@ -99,7 +99,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << BH_Pinned)|(1ul << BH_Lock)); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -117,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << BH_Pinned)); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -- cgit v1.2.3-58-ga151 From ab9bbda0204dfd0e5342562d9979d1241b14ea5f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 15 Aug 2011 14:20:36 +0100 Subject: GFS2: Use ->dirty_inode() The aim of this patch is to use the newly enhanced ->dirty_inode() super block operation to deal with atime updates, rather than piggybacking that code onto ->write_inode() as is currently done. The net result is a simplification of the code in various places and a reduction of the number of gfs2_dinode_out() calls, since this is now implied by ->dirty_inode(). Some of the mark_inode_dirty() calls have been moved under glocks, so that ->dirty_inode() can avoid taking locks when we already hold suitable ones. One consequence is that generic_write_end() now correctly deals with file size updates, so that we do not need a separate check for that afterwards. This also, indirectly, means that fdatasync should work correctly on GFS2 - the current code always syncs the metadata whether it needs to or not. Has survived testing with postmark (with and without atime) and also fsx.
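Editor's note: the interesting part of the new ->dirty_inode() (added in the diff below) is that it acquires only what the caller does not already hold, and records exactly which of the glock and the transaction it took so that it releases only those on exit. A minimal userspace model of that conditional-acquire pattern follows; the names, the plain ints standing in for lock state, and the 0x3 flag mask are illustrative stand-ins, not the kernel API.

#include <stdio.h>

static int glock_held, trans_open;	/* stand-ins for real lock state */

static void dirty_inode(int flags)
{
	int need_unlock = 0, need_endtrans = 0;

	/* Only sync-worthy dirtying implies a dinode update; the kernel
	 * tests I_DIRTY_SYNC|I_DIRTY_DATASYNC here. */
	if (!(flags & 0x3))
		return;

	if (!glock_held) {		/* gfs2_glock_nq_init(...) */
		glock_held = 1;
		need_unlock = 1;
	}
	if (!trans_open) {		/* gfs2_trans_begin(...) */
		trans_open = 1;
		need_endtrans = 1;
	}

	printf("write dinode under glock + transaction\n");

	/* Release only what this call acquired, in reverse order. */
	if (need_endtrans)
		trans_open = 0;		/* gfs2_trans_end(...) */
	if (need_unlock)
		glock_held = 0;		/* gfs2_glock_dq_uninit(...) */
}

int main(void)
{
	dirty_inode(0x1);	/* cold call: takes both, releases both */
	glock_held = 1;		/* simulate a caller already holding the glock */
	dirty_inode(0x1);	/* only opens and closes the transaction */
	glock_held = 0;
	return 0;
}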
Signed-off-by: Steven Whitehouse --- fs/gfs2/acl.c | 5 +-- fs/gfs2/aops.c | 6 ---- fs/gfs2/dir.c | 10 +----- fs/gfs2/file.c | 1 - fs/gfs2/inode.c | 43 ++++++++----------------- fs/gfs2/inode.h | 2 +- fs/gfs2/quota.c | 13 ++------ fs/gfs2/super.c | 97 ++++++++++++++++++++++++++++++++++++--------------------- fs/gfs2/xattr.c | 5 +-- 9 files changed, 85 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 34501b64bc47..65978d7885c8 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) iattr.ia_valid = ATTR_MODE; iattr.ia_mode = mode; - error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + error = gfs2_setattr_simple(inode, &iattr); } return error; @@ -160,6 +160,7 @@ out: int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) { + struct inode *inode = &ip->i_inode; struct posix_acl *acl; char *data; unsigned int len; @@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) - return gfs2_setattr_simple(ip, attr); + return gfs2_setattr_simple(inode, attr); error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); if (error) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f9fbbe96c222..212fe74927ba 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page, KM_USER0); @@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (copied) { if (inode->i_size < to) i_size_write(inode, to); - gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret > 0) { - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - } if (inode == sdp->sd_rindex) { adjust_fs_space(inode); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2045d70753f1..898e62ed5b85 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1681,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - int error; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1710,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } brelse(bh); - error = gfs2_meta_inode_buffer(dip, &bh); - if (error) - return error; - if (!dip->i_entries) gfs2_consist_inode(dip); - gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); - gfs2_dinode_out(dip, bh->b_data); - brelse(bh); mark_inode_dirty(&dip->i_inode); - return error; + return 0; } /** diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9d12286d8111..4416a1cfa1fe 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -802,7 +802,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, from = 0; } - gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); 
brelse(dibh); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 044efe273b97..a0b53d3bd8fa 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -729,8 +729,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); - gfs2_glock_dq_uninit_m(2, ghs); mark_inode_dirty(inode); + gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); return 0; @@ -926,8 +926,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_bh(ip->i_gl, dibh, 1); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(&ip->i_inode); + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); out_brelse: brelse(dibh); @@ -949,11 +950,6 @@ out_child: out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); - if (!error) { - ihold(inode); - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - } return error; } @@ -1026,8 +1022,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, clear_nlink(inode); else drop_nlink(inode); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); mark_inode_dirty(inode); if (inode->i_nlink == 0) gfs2_unlink_di(inode); @@ -1565,21 +1559,10 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - struct inode *inode = &ip->i_inode; - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - setattr_copy(inode, attr); mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); return 0; } @@ -1591,19 +1574,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) * Returns: errno */ -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { int error; if (current->journal_info) - return __gfs2_setattr_simple(ip, attr); + return __gfs2_setattr_simple(inode, attr); - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); if (error) return error; - error = __gfs2_setattr_simple(ip, attr); - gfs2_trans_end(GFS2_SB(&ip->i_inode)); + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); return error; } @@ -1641,7 +1624,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); if (error) goto out_end_trans; @@ -1697,12 +1680,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) error = gfs2_acl_chmod(ip, attr); else - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); out: - gfs2_glock_dq_uninit(&i_gh); if (!error) mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); return error; } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 8d90e0c07672..276e7b52b658 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); extern int gfs2_permission(struct inode *inode, int mask); -extern int 
gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 0e8bb13381e4..3a9a9749f496 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -638,7 +638,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; - struct buffer_head *bh, *dibh; + struct buffer_head *bh; struct page *page; void *kaddr, *ptr; struct gfs2_quota q, *qp; @@ -736,22 +736,13 @@ get_a_page: goto get_a_page; } - /* Update the disk inode timestamp and size (if extended) */ - err = gfs2_meta_inode_buffer(ip, &dibh); - if (err) - goto out; - size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); mark_inode_dirty(inode); - -out: return err; + unlock_out: unlock_page(page); page_cache_release(page); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9961de702d1b..b05fa5954550 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -752,53 +752,79 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = metamapping->backing_dev_info; - struct gfs2_holder gh; - struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = -EAGAIN; - int unlock_required = 0; - - /* Skip timestamp update, if this is from a memalloc */ - if (current->flags & PF_MEMALLOC) - goto do_flush; - if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - unlock_required = 1; - } - ret = gfs2_meta_inode_buffer(ip, &bh); - if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret == 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - gfs2_trans_end(sdp); - } - } - brelse(bh); - } - if (unlock_required) - gfs2_glock_dq_uninit(&gh); -do_flush: + int ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); if (bdi->dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); - if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) + if (wbc->sync_mode == WB_SYNC_ALL) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); return ret; } +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and transaction lock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. 
+ */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + return; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + return; + } + need_unlock = 1; + } + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + } + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) + gfs2_glock_dq_uninit(&gh); +} + /** * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one * @sdp: the filesystem @@ -1578,6 +1604,7 @@ const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, .evict_inode = gfs2_evict_inode, .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 439b61c03262..695304cf01cc 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,7 +1296,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; int error; @@ -1319,7 +1320,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); gfs2_trans_end(sdp); return error; } -- cgit v1.2.3-58-ga151 From 9a63edd12ba3c18351f00d6b77a6b2f49f2b8eb6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 18 Aug 2011 14:35:53 +0100 Subject: GFS2: Clean up gfs2_create If we pass through knowledge of whether the creation is intended to be exclusive or not, then we can deal with that in gfs2_create_inode and remove one set of locking. Also this removes the loop in gfs2_create and simplifies the code a bit. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a0b53d3bd8fa..2af69056a9fd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -663,7 +663,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int mode, dev_t dev, const char *symname, - unsigned int size) + unsigned int size, int excl) { const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; @@ -683,6 +683,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail; error = create_ok(dip, name, mode); + if ((error == -EEXIST) && S_ISREG(mode) && !excl) { + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + gfs2_glock_dq_uninit(ghs); + d_instantiate(dentry, inode); + return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; + } if (error) goto fail_gunlock; @@ -760,24 +766,10 @@ fail: static int gfs2_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - struct inode *inode; - int ret; - - for (;;) { - ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); - if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) - return ret; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode) { - if (!IS_ERR(inode)) - break; - return PTR_ERR(inode); - } - } - - d_instantiate(dentry, inode); - return 0; + int excl = 0; + if (nd && (nd->flags & LOOKUP_EXCL)) + excl = 1; + return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); } /** @@ -1135,7 +1127,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); } /** @@ -1149,7 +1141,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); } /** @@ -1164,7 +1156,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); } /* -- cgit v1.2.3-58-ga151 From 9453615a1a7ef3fa910c6464a619595556cfcd63 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 23 Aug 2011 10:19:25 +0100 Subject: GFS2: Fix lseek after SEEK_DATA, SEEK_HOLE have been added We need to take the inode's glock whenever the inode's size is referenced, otherwise it might not be uptodate. Even though generic_file_llseek_unlocked() doesn't implement SEEK_DATA, SEEK_HOLE directly, it does reference the inode's size in those cases, so we need to add them to the list of origins which need the glock. Signed-off-by: Steven Whitehouse Cc: Andi Kleen --- fs/gfs2/file.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4416a1cfa1fe..d717b72500a4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) struct gfs2_holder i_gh; loff_t error; - if (origin == 2) { + switch (origin) { + case SEEK_END: /* These reference inode->i_size */ + case SEEK_DATA: + case SEEK_HOLE: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } - } else + break; + case SEEK_CUR: + case SEEK_SET: error = generic_file_llseek_unlocked(file, offset, origin); + break; + default: + error = -EINVAL; + } return error; } -- cgit v1.2.3-58-ga151 From 7c9ca621137cde26be05448133fc1a554345f4f8 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 31 Aug 2011 09:53:19 +0100 Subject: GFS2: Use rbtree for resource groups and clean up bitmap buffer ref count scheme Here is an update of Bob's original rbtree patch which, in addition, also resolves the rather strange ref counting that was being done relating to the bitmap blocks. 
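Editor's note, before the details below: the structural core of the change is simple. Resource groups are indexed by their starting block address and looked up by the block range they cover, descending left when the target block is below a node's range and right when it is above. Here is a self-contained sketch of that interval lookup, using a plain unbalanced BST where the kernel uses its rbtree; all names are invented for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Each node owns a half-open block range [start, start + len);
 * ranges do not overlap and the tree is ordered by start. */
struct range_node {
	uint64_t start, len;
	struct range_node *left, *right;
};

static struct range_node *range_insert(struct range_node *root,
				       uint64_t start, uint64_t len)
{
	if (!root) {
		struct range_node *n = calloc(1, sizeof(*n));
		n->start = start;
		n->len = len;
		return n;
	}
	if (start < root->start)
		root->left = range_insert(root->left, start, len);
	else
		root->right = range_insert(root->right, start, len);
	return root;
}

static struct range_node *range_lookup(struct range_node *root, uint64_t blk)
{
	while (root) {
		if (blk < root->start)
			root = root->left;		/* below this range */
		else if (blk >= root->start + root->len)
			root = root->right;		/* above this range */
		else
			return root;	/* blk falls inside this range */
	}
	return NULL;
}

int main(void)
{
	struct range_node *root = NULL;
	root = range_insert(root, 100, 50);
	root = range_insert(root, 150, 50);
	root = range_insert(root, 200, 50);

	struct range_node *n = range_lookup(root, 170);
	if (n)
		printf("block 170 -> rgrp starting at %llu\n",
		       (unsigned long long)n->start);
	return 0;
}

In the patch itself the same walk is written against struct rb_node in gfs2_blk2rgrpd(), under the ->sd_rindex_spin lock.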
Originally we had a dual system for journaling resource groups. The metadata blocks were journaled and also the rgrp itself was added to a list. The reason for adding the rgrp to the list in the journal was so that the "repolish clones" code could be run to update the free space, and potentially send any discard requests when the log was flushed. This was done by comparing the "cloned" bitmap with what had been written back on disk during the transaction commit. Due to this, there was a requirement to hang on to the rgrps' bitmap buffers until the journal had been flushed. For that reason, there was a rather complicated setup in the ->go_lock / ->go_unlock functions for rgrps involving both a mutex and a spinlock (the ->sd_rindex_spin) to maintain a reference count on the buffers. However, the journal maintains a reference count on the buffers anyway, since they are being journaled as metadata buffers. So by moving the code which deals with the post-journal accounting for bitmap blocks to the metadata journaling code, we can entirely dispense with the rather strange buffer ref counting scheme and also the requirement to journal the rgrps. The net result of all this is that the ->sd_rindex_spin is left to do exactly one job, and that is to look after the rbtree of rgrps. This patch is designed to be a stepping stone towards using RCU for the rbtree of resource groups; however, the reduction in the number of uses of the ->sd_rindex_spin is likely to have benefits for multi-threaded workloads anyway. The patch retains ->go_lock and ->go_unlock for rgrps; however, these may also be removed in future in favour of calling the functions directly where required in the code. That will allow locking of resource groups without needing to actually read them in - something that could be useful in speeding up statfs. In the meantime, though, it is valid to dereference ->bi_bh only when the rgrp is locked. This is basically the same rule as before, modulo the references not being valid until the following journal flush. Signed-off-by: Steven Whitehouse Signed-off-by: Bob Peterson Cc: Benjamin Marzinski --- fs/gfs2/glops.c | 42 ++----- fs/gfs2/incore.h | 11 +- fs/gfs2/lops.c | 64 ++++------ fs/gfs2/ops_fstype.c | 3 +- fs/gfs2/rgrp.c | 344 ++++++++++++++------------------------------- fs/gfs2/rgrp.h | 10 +- fs/gfs2/trans.c | 5 - fs/gfs2/trans.h | 14 +-- 8 files changed, 148 insertions(+), 345 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0cc3ff48ce20..6f82aac9b0ee 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -134,6 +134,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int x; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -145,6 +147,15 @@ static void rgrp_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + + if (!rgd) + return; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } } /** @@ -444,33 +455,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) return 0; } -/** - * rgrp_go_lock - operation done after an rgrp lock is locked by - * a first holder on this node.
- * @gl: the glock - * @flags: - * - * Returns: errno - */ - -static int rgrp_go_lock(struct gfs2_holder *gh) -{ - return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); -} - -/** - * rgrp_go_unlock - operation done before an rgrp lock is unlocked by - * a last holder on this node. - * @gl: the glock - * @flags: - * - */ - -static void rgrp_go_unlock(struct gfs2_holder *gh) -{ - gfs2_rgrp_bh_put(gh->gh_gl->gl_object); -} - /** * trans_go_sync - promote/demote the transaction glock * @gl: the glock @@ -573,8 +557,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_lock = rgrp_go_lock, - .go_unlock = rgrp_go_unlock, + .go_lock = gfs2_rgrp_go_lock, + .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_ASPACE, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index be5801a75406..9e753fc9e6a5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -18,6 +18,7 @@ #include #include #include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -78,8 +79,7 @@ struct gfs2_bitmap { }; struct gfs2_rgrpd { - struct list_head rd_list; /* Link with superblock */ - struct list_head rd_list_mru; + struct rb_node rd_node; /* Link with superblock */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ @@ -91,10 +91,7 @@ struct gfs2_rgrpd { u32 rd_dinodes; u64 rd_igeneration; struct gfs2_bitmap *rd_bits; - struct mutex rd_mutex; - struct gfs2_log_element rd_le; struct gfs2_sbd *rd_sbd; - unsigned int rd_bh_count; u32 rd_last_alloc; u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ @@ -575,9 +572,7 @@ struct gfs2_sbd { int sd_rindex_uptodate; spinlock_t sd_rindex_spin; struct mutex sd_rindex_mutex; - struct list_head sd_rindex_list; - struct list_head sd_rindex_mru_list; - struct gfs2_rgrpd *sd_rindex_forward; + struct rb_root sd_rindex_tree; unsigned int sd_rgrps; unsigned int sd_max_rg_data; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 05bbb124699f..de05b4d66ef3 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) trace_gfs2_pin(bd, 1); } +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + if (bi->bi_clone == 0) + return; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; +} + /** * gfs2_unpin - Unpin a buffer * @sdp: the filesystem the buffer belongs to @@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, mark_buffer_dirty(bh); clear_buffer_pinned(bh); + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { list_del(&bd->bd_ail_st_list); @@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_revoke_clean(sdp); } -static void 
rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_rgrpd *rgd; - struct gfs2_trans *tr = current->journal_info; - - tr->tr_touched = 1; - - rgd = container_of(le, struct gfs2_rgrpd, rd_le); - - gfs2_log_lock(sdp); - if (!list_empty(&le->le_list)){ - gfs2_log_unlock(sdp); - return; - } - gfs2_rgrp_bh_hold(rgd); - sdp->sd_log_num_rg++; - list_add(&le->le_list, &sdp->sd_log_le_rg); - gfs2_log_unlock(sdp); -} - -static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -{ - struct list_head *head = &sdp->sd_log_le_rg; - struct gfs2_rgrpd *rgd; - - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); - list_del_init(&rgd->rd_le.le_list); - sdp->sd_log_num_rg--; - - gfs2_rgrp_repolish_clones(rgd); - gfs2_rgrp_bh_put(rgd); - } - gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); -} - /** * databuf_lo_add - Add a databuf to the transaction. * @@ -771,8 +761,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { }; const struct gfs2_log_operations gfs2_rg_lops = { - .lo_add = rg_lo_add, - .lo_after_commit = rg_lo_after_commit, .lo_name = "rg", }; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 079587e53849..69166ff7f37d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -77,8 +77,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_rindex_spin); mutex_init(&sdp->sd_rindex_mutex); - INIT_LIST_HEAD(&sdp->sd_rindex_list); - INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + sdp->sd_rindex_tree.rb_node = NULL; INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7f8af1eb02de..00f6e3d62c22 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -328,15 +329,25 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) { - struct gfs2_rgrpd *rgd; + struct rb_node **newn, *parent = NULL; spin_lock(&sdp->sd_rindex_spin); - list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { - if (rgrp_contains_block(rgd, blk)) { - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + newn = &sdp->sd_rindex_tree.rb_node; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (blk < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (blk > cur->rd_data0 + cur->rd_data) + newn = &((*newn)->rb_right); + else { spin_unlock(&sdp->sd_rindex_spin); - return rgd; + return cur; } } @@ -354,8 +365,13 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) { - gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); - return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + + return rgd; } /** @@ -367,28 +383,34 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) { - if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == 
n)) { + spin_unlock(&sdp->sd_rindex_spin); return NULL; - return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; } static void clear_rgrpdi(struct gfs2_sbd *sdp) { - struct list_head *head; + struct rb_node *n; struct gfs2_rgrpd *rgd; struct gfs2_glock *gl; - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = NULL; - spin_unlock(&sdp->sd_rindex_spin); - - head = &sdp->sd_rindex_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); gl = rgd->rd_gl; - list_del(&rgd->rd_list); - list_del(&rgd->rd_list_mru); + rb_erase(n, &sdp->sd_rindex_tree); if (gl) { gl->gl_object = NULL; @@ -535,6 +557,29 @@ static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); } +static void rgd_insert(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return; + } + + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); +} + /** * read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode @@ -566,14 +611,11 @@ static int read_rindex_entry(struct gfs2_inode *ip, if (!rgd) return error; - mutex_init(&rgd->rd_mutex); - lops_init_le(&rgd->rd_le, &gfs2_rg_lops); rgd->rd_sbd = sdp; - list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); - list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - gfs2_rindex_in(rgd, buf); + rgd_insert(rgd); + error = compute_bitstructs(rgd); if (error) return error; @@ -585,6 +627,8 @@ static int read_rindex_entry(struct gfs2_inode *ip, rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; return error; } @@ -601,8 +645,6 @@ int gfs2_ri_update(struct gfs2_inode *ip) struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; u64 rgrp_count = i_size_read(inode); - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; int error; do_div(rgrp_count, sizeof(struct gfs2_rindex)); @@ -617,10 +659,6 @@ int gfs2_ri_update(struct gfs2_inode *ip) } } - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; } @@ -694,7 +732,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) } /** - * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. 
@@ -703,8 +741,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) { + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; unsigned int length = rgd->rd_length; @@ -712,17 +751,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) unsigned int x, y; int error; - mutex_lock(&rgd->rd_mutex); - - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_bh_count) { - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - mutex_unlock(&rgd->rd_mutex); - return 0; - } - spin_unlock(&sdp->sd_rindex_spin); - for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); @@ -747,15 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_free_clone = rgd->rd_free; } - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - - mutex_unlock(&rgd->rd_mutex); - return 0; fail: @@ -765,52 +787,32 @@ fail: bi->bi_bh = NULL; gfs2_assert_warn(sdp, !bi->bi_clone); } - mutex_unlock(&rgd->rd_mutex); return error; } -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); -} - /** - * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() * @rgd: the struct gfs2_rgrpd describing the RG to read in * */ -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - if (--rgd->rd_bh_count) { - spin_unlock(&sdp->sd_rindex_spin); - return; - } - for (x = 0; x < length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; brelse(bi->bi_bh); bi->bi_bh = NULL; } - spin_unlock(&sdp->sd_rindex_spin); } -static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - const struct gfs2_bitmap *bi) +void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi) { struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; @@ -823,7 +825,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, unsigned int x; for (x = 0; x < bi->bi_len; x++) { - const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *orig = bh->b_data + bi->bi_offset + x; const u8 *clone = bi->bi_clone + bi->bi_offset + x; u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); diff &= 0x55; @@ -862,28 +864,6 @@ fail: sdp->sd_args.ar_discard = 0; } -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int length = rgd->rd_length; - unsigned int x; - - for (x = 0; x < length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - if (!bi->bi_clone) - continue; - if (sdp->sd_args.ar_discard) - gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); - clear_bit(GBF_FULL, &bi->bi_flags); 
- memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); - } - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - spin_unlock(&sdp->sd_rindex_spin); -} - /** * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode * @ip: the incore GFS2 inode structure @@ -911,20 +891,15 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) { - struct gfs2_sbd *sdp = rgd->rd_sbd; - int ret = 0; - if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - spin_lock(&sdp->sd_rindex_spin); if (rgd->rd_free_clone >= al->al_requested) { al->al_rgd = rgd; - ret = 1; + return 1; } - spin_unlock(&sdp->sd_rindex_spin); - return ret; + return 0; } /** @@ -991,76 +966,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip return; } -/** - * recent_rgrp_next - get next RG from "recent" list - * @cur_rgd: current rgrp - * - * Returns: The next rgrp in the recent list - */ - -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) -{ - struct gfs2_sbd *sdp = cur_rgd->rd_sbd; - struct list_head *head; - struct gfs2_rgrpd *rgd; - - spin_lock(&sdp->sd_rindex_spin); - head = &sdp->sd_rindex_mru_list; - if (unlikely(cur_rgd->rd_list_mru.next == head)) { - spin_unlock(&sdp->sd_rindex_spin); - return NULL; - } - rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); - spin_unlock(&sdp->sd_rindex_spin); - return rgd; -} - -/** - * forward_rgrp_get - get an rgrp to try next from full list - * @sdp: The GFS2 superblock - * - * Returns: The rgrp to try next - */ - -static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) -{ - struct gfs2_rgrpd *rgd; - unsigned int journals = gfs2_jindex_size(sdp); - unsigned int rg = 0, x; - - spin_lock(&sdp->sd_rindex_spin); - - rgd = sdp->sd_rindex_forward; - if (!rgd) { - if (sdp->sd_rgrps >= journals) - rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; - - for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; - x++, rgd = gfs2_rgrpd_get_next(rgd)) - /* Do Nothing */; - - sdp->sd_rindex_forward = rgd; - } - - spin_unlock(&sdp->sd_rindex_spin); - - return rgd; -} - -/** - * forward_rgrp_set - set the forward rgrp pointer - * @sdp: the filesystem - * @rgd: The new forward rgrp - * - */ - -static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) -{ - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = rgd; - spin_unlock(&sdp->sd_rindex_spin); -} - /** * get_local_rgrp - Choose and lock a rgrp for allocation * @ip: the inode to reserve space for @@ -1076,14 +981,15 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = ip->i_alloc; - int flags = LM_FLAG_TRY; - int skipped = 0; - int loops = 0; int error, rg_locked; + int loops = 0; - rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); - while (rgd) { + if (rgd == NULL) + return -EBADSLT; + + while (loops < 3) { rg_locked = 0; if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { @@ -1096,80 +1002,24 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) switch (error) { case 0: if (try_rgrp_fit(rgd, al)) - goto out; + return 0; if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); /* fall through */ case GLR_TRYFAILED: - rgd = 
recent_rgrp_next(rgd); - break; - - default: - return error; - } - } - - /* Go through full list of rgrps */ - - begin = rgd = forward_rgrp_get(sdp); - - for (;;) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &al->al_rgd_gh); - } - switch (error) { - case 0: - if (try_rgrp_fit(rgd, al)) - goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); - if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - break; - - case GLR_TRYFAILED: - skipped++; + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == begin) + loops++; break; default: return error; } - - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - - if (rgd == begin) { - if (++loops >= 3) - return -ENOSPC; - if (!skipped) - loops++; - flags = 0; - if (loops == 2) - gfs2_log_flush(sdp, NULL); - } - } - -out: - if (begin) { - spin_lock(&sdp->sd_rindex_spin); - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - spin_unlock(&sdp->sd_rindex_spin); - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - forward_rgrp_set(sdp, rgd); } - return 0; + return -ENOSPC; } /** @@ -1352,6 +1202,7 @@ do_search: /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bi->bi_bh)); if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; @@ -1371,6 +1222,7 @@ skip: if (blk == BFITNOENT) return blk; + *n = 1; if (old_state == new_state) goto out; @@ -1539,9 +1391,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) gfs2_statfs_change(sdp, 0, -(s64)*n, 0); gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); - spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone -= *n; - spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); *bn = block; return 0; @@ -1594,9 +1444,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) gfs2_statfs_change(sdp, 0, -1, +1); gfs2_trans_add_unrevoke(sdp, block, 1); - spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); *bn = block; return 0; @@ -1629,8 +1477,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); - /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) gfs2_meta_wipe(ip, bstart, blen); @@ -1666,7 +1512,6 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); } static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) @@ -1688,7 +1533,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, +1, -1); - gfs2_trans_add_rg(rgd); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index d253f9a8c70e..59b950f43ccf 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -25,11 +25,8 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct 
gfs2_sbd *sdp); extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); -extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); - -extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) @@ -72,5 +69,8 @@ extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9ec73a854111..86ac75d99d31 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) gfs2_log_unlock(sdp); } -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) -{ - lops_add(rgd->rd_sbd, &rgd->rd_le); -} - diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index fb56b783e028..980c5c05398a 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -34,14 +34,12 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) al->al_requested + 1 : al->al_rgd->rd_length; } -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); -void gfs2_trans_end(struct gfs2_sbd *sdp); - -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); -void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); #endif /* __TRANS_DOT_H__ */ -- cgit v1.2.3-58-ga151 From 8339ee543ece6e2dcc1bbd97d5350163c198cf00 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Aug 2011 16:38:29 +0100 Subject: GFS2: Make resource groups "append only" during life of fs Since we have ruled out supporting online filesystem shrink, it is possible to make the resource group list append only during the life of a super block. This gives several benefits: Firstly, we only need to read new rindex elements as they are added rather than needing to reread the whole rindex file each time one element is added. Secondly, the rindex glock can be held for much shorter periods of time, and is completely removed from the fast path for allocations. The lock is taken in shared mode only when updating the resource groups when the first allocation occurs, and after a grow has taken place. Thirdly, this results in a reduction in code size, and everything gets a lot simpler to understand in this area. 
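Editor's note: the refresh logic this enables (visible in gfs2_rindex_update() in the diff below) is a classic check-lock-recheck pattern combined with an append-only read loop, with EOF signalled by a positive return as the updated read_rindex_entry() comment notes. A minimal userspace model follows, assuming pthreads; the names are illustrative, and in the kernel the flag test outside the mutex additionally relies on appropriate memory ordering.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rindex_mutex = PTHREAD_MUTEX_INITIALIZER;
static int rindex_uptodate;
static int nr_entries;		/* entries read into memory so far */
static int total_on_disk = 4;	/* entries currently in the rindex file */

static int read_one_entry(void)
{
	/* Stand-in for read_rindex_entry(): 0 = got one, 1 = EOF. */
	if (nr_entries >= total_on_disk)
		return 1;
	nr_entries++;
	return 0;
}

static int rindex_update(void)
{
	int err = 0;

	if (rindex_uptodate)
		return 0;	/* fast path: no locking at all */

	pthread_mutex_lock(&rindex_mutex);
	if (!rindex_uptodate) {	/* re-check under the lock */
		/* Append only: entries already read are kept, and the
		 * loop picks up from where the last reader stopped. */
		while ((err = read_one_entry()) == 0)
			;
		if (err > 0) {	/* EOF is success */
			err = 0;
			rindex_uptodate = 1;
		}
	}
	pthread_mutex_unlock(&rindex_mutex);
	return err;
}

int main(void)
{
	rindex_update();
	printf("entries after mount-time update: %d\n", nr_entries);

	total_on_disk = 6;	/* simulate gfs2_grow adding two rgrps */
	rindex_uptodate = 0;	/* ...and the corresponding invalidation */
	rindex_update();
	printf("entries after refresh: %d\n", nr_entries);
	return 0;
}

The payoff described above is that concurrent callers do the expensive re-read at most once per invalidation, and the common allocation path never touches the rindex lock at all.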
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 7 --- fs/gfs2/dir.c | 6 -- fs/gfs2/glops.c | 16 ++---- fs/gfs2/incore.h | 1 - fs/gfs2/inode.c | 15 +---- fs/gfs2/rgrp.c | 168 ++++++++++++++++++++++++------------------------------- fs/gfs2/rgrp.h | 17 +++--- fs/gfs2/super.c | 22 ++------ fs/gfs2/xattr.c | 17 +----- 9 files changed, 95 insertions(+), 174 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7878c473ae62..e60296137707 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -783,11 +783,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else if (ip->i_depth) revokes = sdp->sd_inptrs; - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); - else if (!sdp->sd_rgrps) - error = gfs2_ri_update(ip); - if (error) return error; @@ -887,8 +882,6 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - if (ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 898e62ed5b85..90b877b464ca 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1807,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_put; - error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); - if (error) - goto out_qs; - /* Count the number of leaves */ bh = leaf_bh; @@ -1889,8 +1885,6 @@ out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: gfs2_rlist_free(&rlist); - gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); -out_qs: gfs2_quota_unhold(dip); out_put: gfs2_alloc_put(dip); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 6f82aac9b0ee..951541b6234c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -134,8 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { struct address_space *metamapping = gfs2_glock2aspace(gl); - struct gfs2_rgrpd *rgd = gl->gl_object; - unsigned int x; + struct gfs2_rgrpd *rgd; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -148,14 +147,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl) mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); - if (!rgd) - return; - - for (x = 0; x < rgd->rd_length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; - } + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_free_clones(rgd); + spin_unlock(&gl->gl_spin); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 9e753fc9e6a5..56847f5903ae 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -255,7 +255,6 @@ struct gfs2_alloc { unsigned int al_line; char *al_file; - struct gfs2_holder al_ri_gh; struct gfs2_holder al_rgd_gh; struct gfs2_rgrpd *al_rgd; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2af69056a9fd..29703dd97dc9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1041,13 +1041,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; int error; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); @@ -1104,7 +1099,6 @@ out_child: gfs2_glock_dq(ghs); out_parent: gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -1222,7 +1216,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = 
GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -1236,10 +1230,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, &r_gh); @@ -1376,7 +1366,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, al->al_requested = sdp->sd_max_dirres; - error = gfs2_inplace_reserve_ri(ndip); + error = gfs2_inplace_reserve(ndip); if (error) goto out_gunlock_q; @@ -1447,7 +1437,6 @@ out_gunlock_r: if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: - gfs2_glock_dq_uninit(&ri_gh); return error; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 00f6e3d62c22..88d5b75067a8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -332,14 +332,10 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct rb_node **newn, *parent = NULL; spin_lock(&sdp->sd_rindex_spin); - newn = &sdp->sd_rindex_tree.rb_node; - - /* Figure out where to put new node */ while (*newn) { struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); - parent = *newn; if (blk < cur->rd_addr) newn = &((*newn)->rb_left); @@ -350,7 +346,6 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) return cur; } } - spin_unlock(&sdp->sd_rindex_spin); return NULL; @@ -368,8 +363,10 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) const struct rb_node *n; struct gfs2_rgrpd *rgd; + spin_lock(&sdp->sd_rindex_spin); n = rb_first(&sdp->sd_rindex_tree); rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); return rgd; } @@ -400,7 +397,18 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) return rgd; } -static void clear_rgrpdi(struct gfs2_sbd *sdp) +void gfs2_free_clones(struct gfs2_rgrpd *rgd) +{ + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) { struct rb_node *n; struct gfs2_rgrpd *rgd; @@ -413,23 +421,19 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { + spin_lock(&gl->gl_spin); gl->gl_object = NULL; + spin_unlock(&gl->gl_spin); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } + gfs2_free_clones(rgd); kfree(rgd->rd_bits); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_rindex_mutex); - clear_rgrpdi(sdp); - mutex_unlock(&sdp->sd_rindex_mutex); -} - static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); @@ -546,17 +550,6 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) return total_data; } -static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) -{ - const struct gfs2_rindex *str = buf; - - rgd->rd_addr = be64_to_cpu(str->ri_addr); - rgd->rd_length = be32_to_cpu(str->ri_length); - rgd->rd_data0 = be64_to_cpu(str->ri_data0); - rgd->rd_data = be32_to_cpu(str->ri_data); - rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); -} - static void rgd_insert(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; @@ -584,7 +577,7 @@ static void rgd_insert(struct gfs2_rgrpd *rgd) * 
read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode * - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, > 0 on EOF, error code otherwise */ static int read_rindex_entry(struct gfs2_inode *ip, @@ -592,19 +585,18 @@ static int read_rindex_entry(struct gfs2_inode *ip, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); - char buf[sizeof(struct gfs2_rindex)]; + struct gfs2_rindex buf; int error; struct gfs2_rgrpd *rgd; - error = gfs2_internal_read(ip, ra_state, buf, &pos, + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos, sizeof(struct gfs2_rindex)); - if (!error) - return 0; - if (error != sizeof(struct gfs2_rindex)) { - if (error > 0) - error = -EIO; - return error; - } + + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); error = -ENOMEM; @@ -612,23 +604,34 @@ static int read_rindex_entry(struct gfs2_inode *ip, return error; rgd->rd_sbd = sdp; - - gfs2_rindex_in(rgd, buf); - rgd_insert(rgd); + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); error = compute_bitstructs(rgd); if (error) - return error; + goto fail; error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) - return error; + goto fail; rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + rgd_insert(rgd); + sdp->sd_rgrps++; + spin_unlock(&sdp->sd_rindex_spin); + return error; + +fail: + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -639,34 +642,28 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -int gfs2_ri_update(struct gfs2_inode *ip) +static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = i_size_read(inode); int error; - do_div(rgrp_count, sizeof(struct gfs2_rindex)); - clear_rgrpdi(sdp); - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { + do { error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } + } while (error == 0); + + if (error < 0) + return error; sdp->sd_rindex_uptodate = 1; return 0; } /** - * gfs2_rindex_hold - Grab a lock on the rindex + * gfs2_rindex_update - Update the rindex if required * @sdp: The GFS2 superblock - * @ri_gh: the glock holder * * We grab a lock on the rindex inode to make sure that it doesn't * change whilst we are performing an operation. We keep this lock @@ -678,30 +675,29 @@ int gfs2_ri_update(struct gfs2_inode *ip) * special file, which might have been updated if someone expanded the * filesystem (via gfs2_grow utility), which adds new resource groups. 
* - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, error code otherwise */ -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +int gfs2_rindex_update(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); struct gfs2_glock *gl = ip->i_gl; - int error; - - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); - if (error) - return error; + struct gfs2_holder ri_gh; + int error = 0; /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - if (!sdp->sd_rindex_uptodate) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); - if (error) - gfs2_glock_dq_uninit(ri_gh); - } + gfs2_glock_dq_uninit(&ri_gh); mutex_unlock(&sdp->sd_rindex_mutex); } + return error; } @@ -873,8 +869,13 @@ fail: struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; BUG_ON(ip->i_alloc != NULL); ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); + error = gfs2_rindex_update(sdp); + if (error) + fs_warn(sdp, "rindex update returns %d\n", error); return ip->i_alloc; } @@ -1029,7 +1030,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); @@ -1041,18 +1042,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; - if (hold_rindex) { - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read - in, so: */ - error = gfs2_ri_update(ip); - if (error) - return error; - } - try_again: do { error = get_local_rgrp(ip, &last_unlinked); @@ -1069,11 +1058,8 @@ try_again: } } while (error && tries++ < 3); - if (error) { - if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&al->al_ri_gh); + if (error) return error; - } /* no error, so we have the rgrp set in the inode's allocation. 
*/ al->al_file = file; @@ -1103,8 +1089,6 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_ri_gh); } /** @@ -1558,34 +1542,26 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh, rgd_gh; - struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); - int ri_locked = 0; + struct gfs2_holder rgd_gh; int error; - if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - ri_locked = 1; - } + error = gfs2_rindex_update(sdp); + if (error) + return error; error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr); if (!rgd) - goto fail_rindex; + goto fail; error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); if (error) - goto fail_rindex; + goto fail; if (gfs2_get_block_type(rgd, no_addr) != type) error = -ESTALE; gfs2_glock_dq_uninit(&rgd_gh); -fail_rindex: - if (ri_locked) - gfs2_glock_dq_uninit(&ri_gh); fail: return error; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 59b950f43ccf..0439fca18f08 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -18,13 +18,13 @@ struct gfs2_holder; extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); -struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); - +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); @@ -36,16 +36,13 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line); #define gfs2_inplace_reserve(ip) \ - gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) -#define gfs2_inplace_reserve_ri(ip) \ - gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) + gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b05fa5954550..f716c4f8b252 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1037,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) { - struct gfs2_holder ri_gh; struct gfs2_rgrpd *rgd_next; struct gfs2_holder *gha, *gh; unsigned int slots = 64; @@ -1050,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host if (!gha) return -ENOMEM; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - 
rgd_next = gfs2_rgrpd_get_first(sdp); for (;;) { @@ -1096,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host yield(); } - gfs2_glock_dq_uninit(&ri_gh); - -out: kfree(gha); return error; } @@ -1150,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) struct gfs2_statfs_change_host sc; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (gfs2_tune_get(sdp, gt_statfs_slow)) error = gfs2_statfs_slow(sdp, &sc); else @@ -1420,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) goto out; - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - if (error) - goto out_qs; - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); if (!rgd) { gfs2_consist_inode(ip); error = -EIO; - goto out_rindex_relse; + goto out_qs; } error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &al->al_rgd_gh); if (error) - goto out_rindex_relse; + goto out_qs; error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, sdp->sd_jdesc->jd_blocks); @@ -1449,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) out_rg_gunlock: gfs2_glock_dq_uninit(&al->al_rgd_gh); -out_rindex_relse: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_qs: gfs2_quota_unhold(ip); out: diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 695304cf01cc..167f4af53b1d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); - gfs2_glock_dq_uninit(&al->al_ri_gh); - -out_quota: gfs2_quota_unhold(ip); out_alloc: gfs2_alloc_put(ip); @@ -1502,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) - goto out_rindex; + goto out_quota; if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) - goto out_rindex; + goto out_quota; } error = ea_dealloc_block(ip); -out_rindex: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_quota: gfs2_quota_unhold(ip); out_alloc: -- cgit v1.2.3-58-ga151 From 54335b1fca27b84baa75b1f45985d98262003837 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 1 Sep 2011 13:31:59 +0100 Subject: GFS2: Cache the most recently used resource group in the inode This means that after the initial allocation for any inode, the last used resource group is cached in the inode for future use. This drastically reduces the number of lookups of resource groups in the common case, and thus the contention on that data structure. The allocation algorithm is the same as previously, except that we always check to see if the goal block is within the cached rgrp first before going to the rbtree to look one up. 
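In outline, the fast path added by this patch looks as follows (an illustrative sketch only; rgrp_for_goal() is a made-up name, while ip->i_rgd, rgrp_contains_block() and gfs2_blk2rgrpd() are the real identifiers from the diff, where this logic is open-coded in get_local_rgrp()):

	static struct gfs2_rgrpd *rgrp_for_goal(struct gfs2_inode *ip)
	{
		struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);

		/* Fast path: the rgrp used by this inode's last allocation */
		if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
			return ip->i_rgd;

		/* Slow path: look the goal block up in the rindex rbtree */
		return gfs2_blk2rgrpd(sdp, ip->i_goal);
	}

On a successful reservation, get_local_rgrp() stores the chosen rgrp back into ip->i_rgd, so the next allocation from the same inode usually skips the tree walk entirely.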
Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 6 +++--- fs/gfs2/incore.h | 3 +-- fs/gfs2/inode.c | 12 +++++------- fs/gfs2/quota.c | 4 ++-- fs/gfs2/rgrp.c | 51 +++++++++++++++++++++++++-------------------------- fs/gfs2/super.c | 1 + fs/gfs2/trans.h | 8 +++++--- fs/gfs2/xattr.c | 2 +- 9 files changed, 44 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 212fe74927ba..4858e1fed8b1 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; if (alloc_required) - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d717b72500a4..3467f3662149 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -397,7 +397,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) @@ -823,7 +823,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, unsigned int *data_blocks, unsigned int *ind_blocks) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { @@ -912,7 +912,7 @@ retry: al->al_requested = data_blocks + ind_blocks; rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); + RES_RG_HDR + gfs2_rg_blocks(ip); if (gfs2_is_jdata(ip)) rblocks += data_blocks ? 
data_blocks : 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 56847f5903ae..55e335b52839 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -256,8 +256,6 @@ struct gfs2_alloc { unsigned int al_line; char *al_file; struct gfs2_holder al_rgd_gh; - struct gfs2_rgrpd *al_rgd; - }; enum { @@ -279,6 +277,7 @@ struct gfs2_inode { struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_alloc *i_alloc; + struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 29703dd97dc9..55b3bbaf2f25 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + dip->i_rgd->rd_length + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -613,8 +613,7 @@ fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); @@ -731,8 +730,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, brelse(bh); gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); mark_inode_dirty(inode); @@ -896,7 +894,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(dip) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -1371,7 +1369,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(ndip) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a9a9749f496..10a59cd21f0c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -813,7 +813,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_alloc; if (nalloc) - blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -1598,7 +1598,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, error = gfs2_inplace_reserve(ip); if (error) goto out_alloc; - blocks += gfs2_rg_blocks(al); + blocks += gfs2_rg_blocks(ip); } /* Some quotas span block boundaries and can update two blocks, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 88d5b75067a8..5bfb97002c2a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -882,24 +882,21 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) /** * try_rgrp_fit - See if a given reservation will fit in a given RG * @rgd: the RG data - * @al: the struct gfs2_alloc structure describing the reservation + * @ip: the inode * * If there's room for the requested blocks to be allocated from the RG: - * Sets the $al_rgd field in @al. 
* * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) */ -static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) { + const struct gfs2_alloc *al = ip->i_alloc; + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - - if (rgd->rd_free_clone >= al->al_requested) { - al->al_rgd = rgd; + if (rgd->rd_free_clone >= al->al_requested) return 1; - } - return 0; } @@ -985,7 +982,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int error, rg_locked; int loops = 0; - rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) + rgd = begin = ip->i_rgd; + else + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); if (rgd == NULL) return -EBADSLT; @@ -1002,8 +1002,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) } switch (error) { case 0: - if (try_rgrp_fit(rgd, al)) + if (try_rgrp_fit(rgd, ip)) { + ip->i_rgd = rgd; return 0; + } if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) @@ -1014,7 +1016,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (rgd == begin) loops++; break; - default: return error; } @@ -1042,21 +1043,20 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; -try_again: do { error = get_local_rgrp(ip, &last_unlinked); - /* If there is no space, flushing the log may release some */ - if (error) { - if (ip == GFS2_I(sdp->sd_rindex) && - !sdp->sd_rindex_uptodate) { - error = gfs2_ri_update(ip); - if (error) - return error; - goto try_again; - } - gfs2_log_flush(sdp, NULL); + if (error != -ENOSPC) + break; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + break; + continue; } - } while (error && tries++ < 3); + /* Flushing the log may release space */ + gfs2_log_flush(sdp, NULL); + } while (tries++ < 3); if (error) return error; @@ -1086,7 +1086,6 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_alloced, al->al_requested, al->al_file, al->al_line); - al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); } @@ -1339,7 +1338,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) if (al == NULL) return -ECANCELED; - rgd = al->al_rgd; + rgd = ip->i_rgd; if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; @@ -1398,7 +1397,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_alloc *al = dip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; + struct gfs2_rgrpd *rgd = dip->i_rgd; u32 blk; u64 block; unsigned int n = 1; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index f716c4f8b252..87e9141a4def 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1574,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (ip) { ip->i_flags = 0; ip->i_gl = NULL; + ip->i_rgd = NULL; } return &ip->i_inode; } diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 980c5c05398a..f8f101ef600c 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -28,10 +28,12 @@ struct gfs2_glock; /* reserve either the number of blocks to be allocated plus the rg header * block, or all of the blocks in the rg, whichever is smaller */ -static inline unsigned int 
gfs2_rg_blocks(const struct gfs2_alloc *al) +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) { - return (al->al_requested < al->al_rgd->rd_length)? - al->al_requested + 1 : al->al_rgd->rd_length; + const struct gfs2_alloc *al = ip->i_alloc; + if (al->al_requested < ip->i_rgd->rd_length) + return al->al_requested + 1; + return ip->i_rgd->rd_length; } extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 167f4af53b1d..e7bf0ea1c3cc 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -727,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + gfs2_rg_blocks(al) + + blks + gfs2_rg_blocks(ip) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; -- cgit v1.2.3-58-ga151 From 534029e2fd06ac9a5a1b33b2735e3ac3242adb29 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 1 Sep 2011 16:36:44 +0100 Subject: GFS2: Remove obsolete assert Given that a resource group has been locked, there is no reason why we should not be able to allocate as many blocks as are free. The al_requested parameter should really be considered as a minimum number of blocks to be available. Should this limit be overshot, there are other mechanisms which will prevent over allocation. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5bfb97002c2a..08b3a8002aca 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1077,15 +1077,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; - if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) - fs_warn(sdp, "al_alloced = %u, al_requested = %u " - "al_file = %s, al_line = %u\n", - al->al_alloced, al->al_requested, al->al_file, - al->al_line); - if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); } -- cgit v1.2.3-58-ga151 From d56fa8a1c17b68274349fc852f634af99c0c4671 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Sep 2011 12:43:41 +0100 Subject: GFS2: Call do_strip() directly from recursive_scan() The recursive_scan() function only ever takes a single "bc" argument, so we might as well just call do_strip() directly from recursive_scan() rather than pass it in as an argument. Also the "data" argument is always a struct strip_mine, so we can pass that in, rather than using a void pointer. This also moves do_strip() ahead of recursive_scan() so that we don't need to add a prototype. 
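Reduced to its essentials, the change swaps an indirect call for a direct one (both lines excerpted from the hunks below):

	/* before: generic callback plus opaque data */
	error = bc(ip, dibh, bh, top, bottom, height, data);

	/* after: the one real callee, with a typed argument */
	error = do_strip(ip, dibh, bh, top, bottom, height, sm);

Since do_strip() was the only callback ever passed in, dropping the block_call_t indirection costs nothing, lets the compiler see the call target, and replaces a void * with a type-checked struct strip_mine *.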
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 149 +++++++++++++++++++++++++++------------------------------ 1 file changed, 71 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e60296137707..9d3a0c26df28 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -36,11 +36,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, - __be64 *bottom, unsigned int height, - void *data); - struct strip_mine { int sm_first; unsigned int sm_height; @@ -667,76 +662,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi return ret; } -/** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @bc: the call to make for each piece of metadata - * @data: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, block_call_t bc, - void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = bc(ip, dibh, bh, top, bottom, height, data); - if (error) - goto out; - - if (height < ip->i_height - 1) - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, bc, data); - if (error) - break; - } - -out: - brelse(bh); - return error; -} - /** * do_strip - Look for a layer a particular layer of the file and strip it off * @ip: the inode @@ -752,9 +677,8 @@ out: static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, void *data) + unsigned int height, struct strip_mine *sm) { - struct strip_mine *sm = data; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; @@ -885,6 +809,75 @@ out: return error; } +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @sm: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. 
+ * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, struct strip_mine *sm) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = do_strip(ip, dibh, bh, top, bottom, height, sm); + if (error) + goto out; + + if (height < ip->i_height - 1) + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, sm); + if (error) + break; + } + +out: + brelse(bh); + return error; +} + + /** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * @@ -1024,7 +1017,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) sm.sm_first = !!size; sm.sm_height = height; - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); if (error) break; } -- cgit v1.2.3-58-ga151 From 70b0c3656f12964a6dac104214c904c66e626058 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Sep 2011 16:08:09 +0100 Subject: GFS2: Use cached rgrp in gfs2_rlist_add() Each block which is deallocated requires a call to gfs2_rlist_add(), and each of those calls was calling gfs2_blk2rgrpd() in order to figure out which rgrp the block belonged in. This can be speeded up by making use of the rgrp cached in the inode. We also reset this cached rgrp in case the block has changed rgrp. This should provide a big reduction in gfs2_blk2rgrpd() calls during deallocation. 
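The new lookup order in gfs2_rlist_add(), condensed from the rgrp.c hunk below:

	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
		rgd = ip->i_rgd;	/* common case: same rgrp as last time */
	else
		rgd = gfs2_blk2rgrpd(sdp, block);	/* fall back to the rbtree */
	if (!rgd) {
		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
		return;
	}
	ip->i_rgd = rgd;	/* cache for the next call */

Blocks freed from a single file tend to sit in the same resource group, so consecutive calls usually hit the cached rgrp and never touch the tree.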
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 4 ++-- fs/gfs2/dir.c | 2 +- fs/gfs2/rgrp.c | 14 +++++++++----- fs/gfs2/rgrp.h | 2 +- fs/gfs2/xattr.c | 4 ++-- 5 files changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9d3a0c26df28..22ad413213ca 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -724,7 +724,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; @@ -732,7 +732,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; /* Nothing to do */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 90b877b464ca..8ccad2467cb6 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1821,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (blk != leaf_no) brelse(bh); - gfs2_rlist_add(sdp, &rlist, blk); + gfs2_rlist_add(dip, &rlist, blk); l_blocks++; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 08b3a8002aca..3088fb25656d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1560,7 +1560,7 @@ fail: /** * gfs2_rlist_add - add a RG to a list of RGs - * @sdp: the filesystem + * @ip: the inode * @rlist: the list of resource groups * @block: the block * @@ -1570,9 +1570,10 @@ fail: * */ -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_rgrpd **tmp; unsigned int new_space; @@ -1581,12 +1582,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) return; - rgd = gfs2_blk2rgrpd(sdp, block); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) + rgd = ip->i_rgd; + else + rgd = gfs2_blk2rgrpd(sdp, block); if (!rgd) { - if (gfs2_consist(sdp)) - fs_err(sdp, "block = %llu\n", (unsigned long long)block); + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); return; } + ip->i_rgd = rgd; for (x = 0; x < rlist->rl_rgrps; x++) if (rlist->rl_rgd[x] == rgd) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0439fca18f08..0e886d830784 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -60,7 +60,7 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index e7bf0ea1c3cc..71d7bf830c09 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1356,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; } blks++; } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; -- cgit v1.2.3-58-ga151 From b5b24d7aeb9608935786369ac2d3e9f362877d55 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Sep 2011 10:33:25 +0100 Subject: GFS2: Fix AIL flush issue during fsync Unfortunately, it is not enough to just ignore locked buffers 
during the AIL flush from fsync. We need to be able to ignore all buffers which are locked, dirty or pinned at this stage as they might have been added subsequent to the log flush earlier in the fsync function. In addition, this means that we no longer need to rely on i_mutex to keep out writes during fsync, so we can, as a side-effect, remove that protection too. Signed-off-by: Steven Whitehouse Tested-By: Abhijith Das --- fs/gfs2/file.c | 8 ++------ fs/gfs2/glops.c | 32 ++++++++++++++++---------------- fs/gfs2/glops.h | 2 +- fs/gfs2/super.c | 2 +- 4 files changed, 20 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3467f3662149..3b65f67bb38e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -593,16 +593,12 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, sync_state &= ~I_DIRTY_SYNC; if (sync_state) { - mutex_lock(&inode->i_mutex); ret = sync_inode_metadata(inode, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); + if (ret) return ret; - } if (gfs2_is_jdata(ip)) filemap_write_and_wait(mapping); - gfs2_ail_flush(ip->i_gl); - mutex_unlock(&inode->i_mutex); + gfs2_ail_flush(ip->i_gl, 1); } if (mapping->nrpages) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 951541b6234c..78418b4fa857 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -42,41 +42,41 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) * * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl, unsigned long b_state) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; - struct gfs2_bufdata *bd; + struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); sector_t blocknr; + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, - bd_ail_gl_list); + list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { bh = bd->bd_bh; - blocknr = bh->b_blocknr; - if (bh->b_state & b_state) + if (bh->b_state & b_state) { + if (fsync) + continue; gfs2_ail_error(gl, bh); + } + blocknr = bh->b_blocknr; bh->b_private = NULL; gfs2_remove_from_ail(bd); /* drops ref on bh */ - spin_unlock(&sdp->sd_ail_lock); bd->bd_bh = NULL; bd->bd_blkno = blocknr; - gfs2_log_lock(sdp); gfs2_trans_add_revoke(sdp, bd); - gfs2_log_unlock(sdp); - - spin_lock(&sdp->sd_ail_lock); } - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); } @@ -99,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << BH_Pinned)|(1ul << BH_Lock)); + __gfs2_ail_flush(gl, 0); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -void gfs2_ail_flush(struct gfs2_glock *gl) +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); @@ -117,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << 
BH_Pinned)); + __gfs2_ail_flush(gl, fsync); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 6fce409b5a50..bf95a2dc1662 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; -extern void gfs2_ail_flush(struct gfs2_glock *gl); +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 87e9141a4def..71e420989f77 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1533,7 +1533,7 @@ static void gfs2_evict_inode(struct inode *inode) out_truncate: gfs2_log_flush(sdp, ip->i_gl); write_inode_now(inode, 1); - gfs2_ail_flush(ip->i_gl); + gfs2_ail_flush(ip->i_gl, 0); /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); -- cgit v1.2.3-58-ga151 From ccad4e147acf2a59b463f5df3cee8b43b144ce82 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Sep 2011 12:15:23 +0100 Subject: GFS2: Correctly set goal block after allocation The new goal block should be set to the end of the newly allocated extent, not the start of it. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3088fb25656d..8ec41744594b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1346,7 +1346,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; - ip->i_goal = block; + ip->i_goal = block + *n - 1; error = gfs2_meta_inode_buffer(ip, &dibh); if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; -- cgit v1.2.3-58-ga151 From 13d921e37174e3d1042deeb303537c1d935da553 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Sep 2011 15:12:51 +0100 Subject: GFS2: Clean up ->page_mkwrite This patch brings gfs2's ->page_mkwrite up to date with respect to the expectations set by the VM. Also added is a check to wait, if the fs is frozen, before we attempt to get a glock. This will only work on the node which initiates the freeze, but that's OK since the transaction lock will still provide the expected barrier on other nodes. The major change here is that we return a locked page now, except when we don't return a page at all (error cases). This removes the race which required rechecking the page after it was returned. Signed-off-by: Steven Whitehouse Cc: Nick Piggin --- fs/gfs2/file.c | 64 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3b65f67bb38e..aa3a4ddb834e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -366,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; struct gfs2_alloc *al; + loff_t size; int ret; + /* Wait if fs is frozen. 
This is racy so we check again later on + * and retry if the fs has been frozen after the page lock has + * been acquired + */ + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) @@ -376,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(page); + } goto out_unlock; + } + ret = -ENOMEM; al = gfs2_alloc_get(ip); if (al == NULL) @@ -405,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; + size = i_size_read(inode); + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + /* Check page index against inode size */ + if (size == 0 || (page->index > last_index)) + goto out_trans_end; + + ret = -EAGAIN; + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) + goto out_trans_end; + + /* Unstuff, if required, and allocate backing blocks for page */ ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { + if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); + if (ret == 0) + ret = gfs2_allocate_page_backing(page); -out_unlock_page: - unlock_page(page); +out_trans_end: + if (ret) + unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); @@ -431,11 +453,17 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else if (ret) - ret = VM_FAULT_SIGBUS; - return ret; + if (ret == 0) { + set_page_dirty(page); + /* This check must be post dropping of transaction lock */ + if (inode->i_sb->s_frozen == SB_UNFROZEN) { + wait_on_page_writeback(page); + } else { + ret = -EAGAIN; + unlock_page(page); + } + } + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct gfs2_vm_ops = { -- cgit v1.2.3-58-ga151 From f75bbfb4dda68c86eb33cde7e2b5c1343c6d5812 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 8 Sep 2011 10:21:13 +0100 Subject: GFS2: Fix off-by-one in gfs2_blk2rgrpd Bob reported: I found an off-by-one problem with how I coded this section: It should be: + else if (blk >= cur->rd_data0 + cur->rd_data) In fact, cur->rd_data0 + cur->rd_data is the start of the next rgrp (the next ri_addr), so without the "=" check it can land on the wrong rgrp. In all normal cases, this won't be a problem: you're searching for a block _within_ the rgrp, which will pass the test properly. Where it gets into trouble is if you search the rgrps for the block exactly equal to ri_addr. I don't think anything in the kernel does this, but I found a place in gfs2-utils gfs2_edit where it does. So I definitely need to fix it in libgfs2. I'd like to suggest we fix it in the kernel as well for the sake of keeping the functions similar. So this patch fixes the above-mentioned off-by-one error, as well as removing the unused parent pointer. 
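The fixed comparison encodes the fact that an rgrp owns the half-open block range [rd_data0, rd_data0 + rd_data), so its upper bound is exclusive (descent logic as in the hunk below, locking elided):

	if (blk < cur->rd_addr)
		newn = &((*newn)->rb_left);
	else if (blk >= cur->rd_data0 + cur->rd_data)	/* >=: first block of the next rgrp */
		newn = &((*newn)->rb_right);
	else
		return cur;	/* blk falls inside this rgrp */

With the old strict > test, a lookup for blk == rd_data0 + rd_data matched cur instead of descending right, returning the wrong rgrp.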
Reported-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8ec41744594b..1daf8a78c733 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -329,17 +329,16 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) { - struct rb_node **newn, *parent = NULL; + struct rb_node **newn; + struct gfs2_rgrpd *cur; spin_lock(&sdp->sd_rindex_spin); newn = &sdp->sd_rindex_tree.rb_node; while (*newn) { - struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, - rd_node); - parent = *newn; + cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); if (blk < cur->rd_addr) newn = &((*newn)->rb_left); - else if (blk > cur->rd_data0 + cur->rd_data) + else if (blk >= cur->rd_data0 + cur->rd_data) newn = &((*newn)->rb_right); else { spin_unlock(&sdp->sd_rindex_spin); -- cgit v1.2.3-58-ga151 From bd5437a7d4307a35f2c7cc19cad706ec0e5d61f0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 15 Sep 2011 09:59:56 -0400 Subject: GFS2: speed up delete/unlink performance for large files This patch improves the performance of delete/unlink operations in a GFS2 file system where the files are large by adding a layer of metadata read-ahead for indirect blocks. Mileage will vary, but on my system, deleting an 8.6G file dropped from 22 seconds to about 4.5 seconds. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 22ad413213ca..834cd9442a1d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -831,7 +831,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh = NULL; - __be64 *top, *bottom; + __be64 *top, *bottom, *t2; u64 bn; int error; int mh_size = sizeof(struct gfs2_meta_header); @@ -859,7 +859,27 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out; - if (height < ip->i_height - 1) + if (height < ip->i_height - 1) { + struct buffer_head *rabh; + + for (t2 = top; t2 < bottom; t2++, first = 0) { + if (!*t2) + continue; + + bn = be64_to_cpu(*t2); + rabh = gfs2_getbuf(ip->i_gl, bn, CREATE); + if (trylock_buffer(rabh)) { + if (buffer_uptodate(rabh)) { + unlock_buffer(rabh); + brelse(rabh); + continue; + } + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + brelse(rabh); + } for (; top < bottom; top++, first = 0) { if (!*top) continue; @@ -871,7 +891,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) break; } - + } out: brelse(bh); return error; -- cgit v1.2.3-58-ga151 From 64dd153c83743af81f20924c6343652d731eeecb Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Mon, 12 Sep 2011 18:15:24 -0500 Subject: GFS2: rewrite fallocate code to write blocks directly GFS2's fallocate code currently goes through the page cache. Since it's only writing to the end of the file or to holes in it, it doesn't need to, and it was causing issues in low-memory environments. This patch pulls in some of Steve's block allocation work, and uses it to simply allocate the blocks for the file, and zero them out at allocation time. It provides a slight performance increase, and it dramatically simplifies the code. 
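The heart of the new fallocate_chunk() loop, in outline (error handling trimmed; see the file.c hunk below):

	while (len) {
		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };

		bh_map.b_size = len;
		set_buffer_zeronew(&bh_map);	/* request zeroing at allocation time */
		error = gfs2_block_map(inode, lblock, &bh_map, 1);
		if (unlikely(error))
			goto out;
		len -= bh_map.b_size;		/* block_map may map less than asked for */
		lblock += bh_map.b_size >> inode->i_blkbits;
	}

The new BH_Zeronew buffer state tells gfs2_bmap_alloc() to zero each freshly allocated extent with sb_issue_zeroout(), so the data never passes through the page cache at all.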
Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 12 ++++ fs/gfs2/file.c | 171 ++++++++----------------------------------------------- fs/gfs2/incore.h | 3 + 3 files changed, 39 insertions(+), 147 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 834cd9442a1d..97b61955850a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -427,12 +428,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; + int ret; int eob = 0; enum alloc_state state; __be64 *ptr; @@ -535,6 +538,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, dblock = bn; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); + if (buffer_zeronew(bh_map)) { + ret = sb_issue_zeroout(sb, dblock, dblks, + GFP_NOFS); + if (ret) { + fs_err(sdp, + "Failed to zero data buffers\n"); + clear_buffer_zeronew(bh_map); + } + } break; } } while ((state != ALLOC_DATA) || !dblock); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index aa3a4ddb834e..5002408dabea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -669,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } -static int empty_write_end(struct page *page, unsigned from, - unsigned to, int mode) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - unsigned offset, blksize = 1 << inode->i_blkbits; - pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; - - zero_user(page, from, to-from); - mark_page_accessed(page); - - if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); - return 0; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - clear_buffer_new(bh); - write_dirty_buffer(bh, WRITE); - } - offset += blksize; - bh = bh->b_this_page; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - return -EIO; - } - offset += blksize; - bh = bh->b_this_page; - } - return 0; -} - -static int needs_empty_write(sector_t block, struct inode *inode) -{ - int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - - bh_map.b_size = 1 << inode->i_blkbits; - error = gfs2_block_map(inode, block, &bh_map, 0); - if (unlikely(error)) - return error; - return !buffer_mapped(&bh_map); -} - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to, - int mode) -{ - struct inode *inode = page->mapping->host; - unsigned start, end, next, blksize; - sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - int ret; - - blksize = 1 << inode->i_blkbits; - next = end = 0; - while (next < from) { - next += blksize; - block++; - } - start = next; - do { - next += blksize; - ret = needs_empty_write(block, inode); - if (unlikely(ret < 0)) - return ret; - if (ret == 0) { - if (end) { - ret = 
__block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - end = 0; - } - start = next; - } - else - end = next; - block++; - } while (next < to); - - if (end) { - ret = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - } - - return 0; -} - static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, int mode) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; + unsigned int nr_blks; + sector_t lblock = offset >> inode->i_blkbits; error = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(error)) - goto out; + return error; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -807,39 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } + while (len) { + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + bh_map.b_size = len; + set_buffer_zeronew(&bh_map); - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to, mode); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) + error = gfs2_block_map(inode, lblock, &bh_map, 1); + if (unlikely(error)) goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; + len -= bh_map.b_size; + nr_blks = bh_map.b_size >> inode->i_blkbits; + lblock += nr_blks; + if (!buffer_new(&bh_map)) + continue; + if (unlikely(!buffer_zeronew(&bh_map))) { + error = -EIO; + goto out; + } } + if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) + i_size_write(inode, offset + len); mark_inode_dirty(inode); - brelse(dibh); - out: + brelse(dibh); return error; } @@ -879,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, int error; loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; /* We only support the FALLOC_FL_KEEP_SIZE mode */ @@ -932,7 +808,8 @@ retry: goto out_qunlock; } max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + calc_max_reserv(ip, (len > max_chunk_size)? 
max_chunk_size: len, + &max_bytes, &data_blocks, &ind_blocks); al->al_requested = data_blocks + ind_blocks; rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 55e335b52839..6429aa4339ff 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -103,12 +103,15 @@ struct gfs2_rgrpd { enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, + BH_Zeronew = BH_PrivateStart + 2, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) +BUFFER_FNS(Zeronew, zeronew) +TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; -- cgit v1.2.3-58-ga151 From 891a8e9335176b7eb9adc5e34f555ee5e1da47c6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 19 Sep 2011 10:25:49 +0100 Subject: GFS2: Misc fixes Some items picked up through automated code analysis. A few bits of unreachable code and two unchecked return values. Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 2 -- fs/gfs2/ops_fstype.c | 3 --- fs/gfs2/quota.c | 11 ++++++++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index de05b4d66ef3..0301be655b12 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -695,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - if (error) - break; sdp->sd_replayed_blocks++; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 69166ff7f37d..7e823bbd2453 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -651,7 +651,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); } - ip = GFS2_I(sdp->sd_jindex); /* Load in the journal index special file */ @@ -763,7 +762,6 @@ fail: static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct gfs2_inode *ip; struct inode *master = sdp->sd_master_dir->d_inode; if (undo) @@ -788,7 +786,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get resource index inode: %d\n", error); goto fail_statfs; } - ip = GFS2_I(sdp->sd_rindex); sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 10a59cd21f0c..7e528dc14f85 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -645,8 +645,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, int err, nbytes; u64 size; - if (gfs2_is_stuffed(ip)) - gfs2_unstuff_dinode(ip, NULL); + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip, NULL); + if (err) + return err; + } memset(&q, 0, sizeof(struct gfs2_quota)); err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); @@ -927,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) unsigned int x; int error = 0; - gfs2_quota_hold(ip, uid, gid); + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) -- cgit v1.2.3-58-ga151 From 9ae32429fe036fcfce036ec57b28fc59f3911976 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Sep 2011 12:16:11 +0100 Subject: GFS2: Remove two unused variables The two variables being initialised in gfs2_inplace_reserve to track the file & line number of the caller are never used, so we might as well remove them. If something does go wrong, then a stack trace is probably more useful anyway. 
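For reference, the pattern being deleted is the usual caller-site capture behind a wrapper macro (see the rgrp.h hunk below):

	/* before: every caller's __FILE__/__LINE__ stored in the gfs2_alloc */
	#define gfs2_inplace_reserve(ip) \
		gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)

	/* after: a plain function; debugging relies on stack traces instead */
	extern int gfs2_inplace_reserve(struct gfs2_inode *ip);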
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 4 ---- fs/gfs2/rgrp.c | 14 +++----------- fs/gfs2/rgrp.h | 6 +----- 3 files changed, 4 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6429aa4339ff..7389dfdcc9ef 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -246,7 +246,6 @@ struct gfs2_glock { struct gfs2_alloc { /* Quota stuff */ - struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; unsigned int al_qd_num; @@ -255,9 +254,6 @@ struct gfs2_alloc { u32 al_alloced; /* Filled in by gfs2_alloc_*() */ /* Filled in by gfs2_inplace_reserve() */ - - unsigned int al_line; - char *al_file; struct gfs2_holder al_rgd_gh; }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1daf8a78c733..96bd6d759f29 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1024,14 +1024,13 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) } /** - * gfs2_inplace_reserve_i - Reserve space in the filesystem + * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for * * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line) +int gfs2_inplace_reserve(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; @@ -1057,14 +1056,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, gfs2_log_flush(sdp, NULL); } while (tries++ < 3); - if (error) - return error; - - /* no error, so we have the rgrp set in the inode's allocation. */ - al->al_file = file; - al->al_line = line; - - return 0; + return error; } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0e886d830784..cf5c50180192 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -36,11 +36,7 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line); -#define gfs2_inplace_reserve(ip) \ - gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) - +extern int gfs2_inplace_reserve(struct gfs2_inode *ip); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -- cgit v1.2.3-58-ga151 From b99b98dc2673a123a73068f16720232d7be7e669 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 21 Sep 2011 11:05:16 +0100 Subject: GFS2: Move readahead of metadata during deallocation into its own function Move the recently added readahead of the indirect pointer tree during deallocation into its own function in order that we can use it elsewhere in the future. Also this fixes the resetting of the "first" variable in the original patch. 
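The helper keeps the non-blocking behaviour of the original code: trylock_buffer() means read-ahead never stalls the deallocation walk (loop body as in the gfs2_metapath_ra() hunk below):

	rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
	if (trylock_buffer(rabh)) {
		if (!buffer_uptodate(rabh)) {
			rabh->b_end_io = end_buffer_read_sync;
			submit_bh(READA | REQ_META, rabh);	/* async; completion drops the ref */
			continue;
		}
		unlock_buffer(rabh);	/* already cached, nothing to do */
	}
	brelse(rabh);

If another reader holds the buffer lock, the block is simply skipped, and READA keeps the request best-effort at the block layer as well.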
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 97b61955850a..41d494d79709 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -269,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; } +static void gfs2_metapath_ra(struct gfs2_glock *gl, + const struct buffer_head *bh, const __be64 *pos) +{ + struct buffer_head *rabh; + const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); + const __be64 *t; + + for (t = pos; t < endp; t++) { + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } +} + /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode @@ -843,7 +867,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh = NULL; - __be64 *top, *bottom, *t2; + __be64 *top, *bottom; u64 bn; int error; int mh_size = sizeof(struct gfs2_meta_header); @@ -872,26 +896,9 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, goto out; if (height < ip->i_height - 1) { - struct buffer_head *rabh; - for (t2 = top; t2 < bottom; t2++, first = 0) { - if (!*t2) - continue; + gfs2_metapath_ra(ip->i_gl, bh, top); - bn = be64_to_cpu(*t2); - rabh = gfs2_getbuf(ip->i_gl, bn, CREATE); - if (trylock_buffer(rabh)) { - if (buffer_uptodate(rabh)) { - unlock_buffer(rabh); - brelse(rabh); - continue; - } - rabh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META, rabh); - continue; - } - brelse(rabh); - } for (; top < bottom; top++, first = 0) { if (!*top) continue; -- cgit v1.2.3-58-ga151 From a2d6b6cacb4fd4494b4037c01abb1332cefbb37b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 21 Oct 2011 10:14:04 +0400 Subject: CIFS: Fix error handling in cifs_readv_complete In cifs_readv_receive we don't update rdata->result to an error value after kmap'ing a page. We should kunmap the page only in the no-error case. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index aaad4ce6e6c5..4435b11c41b9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1619,9 +1619,9 @@ cifs_readv_complete(struct work_struct *work) list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { list_del(&page->lru); lru_cache_add_file(page); - kunmap(page); if (rdata->result == 0) { + kunmap(page); flush_dcache_page(page); SetPageUptodate(page); } -- cgit v1.2.3-58-ga151 From 1939dd84b3f52e9c8d1b46dffae2058f16a3ff6a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 22 Oct 2011 01:26:05 -0400 Subject: ext4: cleanup ext4_ext_grow_indepth code Currently the code gives the impression that the grow procedure is very complicated, with mythical paths and blocks involved. But in fact growing in depth is a relatively simple procedure: 1) Just create a new meta block and copy the root data to that block. 
2) Convert the root from extent to index format if the old depth == 0 3) Update the root block pointer This patch does the following: - Reorganize the code to make it more self-explanatory - Do not pass the path parameter to new_meta_block(), in order to provoke allocation from the inode's group, because the top-level block should sit closer to its inode, not to the leaf data block. [ This happens anyway, due to logic in mballoc; we should drop the path parameter from new_meta_block() entirely. -- tytso ] Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2fc6cc0a51ca..385ebb435332 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1050,16 +1050,14 @@ cleanup: */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags, - struct ext4_ext_path *path, struct ext4_extent *newext) { - struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, + newblock = ext4_ext_new_meta_block(handle, inode, NULL, newext, &err, flags); if (newblock == 0) return err; @@ -1079,7 +1077,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, } /* move top-level index/leaf into new block */ - memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); /* set size of new block */ neh = ext_block_hdr(bh); @@ -1097,32 +1096,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (err) goto out; - /* create index in new top-level index: num,max,pointer */ - err = ext4_ext_get_access(handle, inode, curp); - if (err) - goto out; - - curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); - curp->p_hdr->eh_entries = cpu_to_le16(1); - curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - - if (path[0].p_hdr->eh_depth) - curp->p_idx->ei_block = - EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; - else - curp->p_idx->ei_block = - EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; - ext4_idx_store_pblock(curp->p_idx, newblock); - + /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(path->p_depth + 1); - err = ext4_ext_dirty(handle, inode, curp); + neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); + ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1170,8 +1160,7 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, - path, newext); + err = ext4_ext_grow_indepth(handle, inode, flags, newext); if (err) goto out; -- cgit v1.2.3-58-ga151 From 42274bb22afc3e877ae5abed787b0b09d7dede52 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 14:37:50 +0400 Subject: CIFS: Fix DFS handling in cifs_get_file_info
We should call cifs_all_info_to_fattr in rc == 0 case only. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 663c4e313be4..2c50bd2f65d1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp) xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); - if (rc == -EOPNOTSUPP || rc == -EINVAL) { + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: /* * FIXME: legacy server -- fall back to path-based call? * for now, just skip revalidating and mark inode for @@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp) */ rc = 0; CIFS_I(inode)->time = 0; + default: goto cgfi_exit; - } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); - rc = 0; - } else if (rc) - goto cgfi_exit; + } /* * don't bother with SFU junk here -- just mark inode as needing * revalidation. */ - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; cifs_fattr_to_inode(inode, &fattr); -- cgit v1.2.3-58-ga151 From 5423732a71577f7860c56a4eea2c34ff162ddd73 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:12:58 -0700 Subject: nfsd41: use SEQ4_STATUS_BACKCHANNEL_FAULT when cb_sequence is invalid Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 10 ++++++++++ fs/nfsd/nfs4state.c | 8 +++++++- fs/nfsd/state.h | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index de018ecadae6..7748d6a18d97 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: + if (status) + nfsd4_mark_cb_fault(cb->cb_clp, status); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) warn_no_callback_path(clp, reason); } +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_FAULT; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e8c2a3ec0e60..b51ad43b7ea1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1945,8 +1945,14 @@ out: nfsd4_get_session(cstate->session); atomic_inc(&clp->cl_refcount); - if (clp->cl_cb_state == NFSD4_CB_DOWN) + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags |= SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + } } kfree(conn); spin_unlock(&client_lock); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1a5820066040..a3cf38476a1b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -258,6 +258,7 @@ 
struct nfs4_client { #define NFSD4_CB_UP 0 #define NFSD4_CB_UNKNOWN 1 #define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; -- cgit v1.2.3-58-ga151 From fc0c3dd13bac0675cdedc4e0d0641aa8a22e82de Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:13:06 -0700 Subject: nfsd4: seq->status_flags may be used uninitialized Reported-by: Gopala Suryanarayana Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b51ad43b7ea1..1527aaffb000 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1947,11 +1947,13 @@ out: atomic_inc(&clp->cl_refcount); switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: - seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; break; case NFSD4_CB_FAULT: - seq->status_flags |= SEQ4_STATUS_BACKCHANNEL_FAULT; + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; break; + default: + seq->status_flags = 0; } } kfree(conn); -- cgit v1.2.3-58-ga151 From c668fc6dfcce98f8222ce1c997f4e5c4ac63f3d0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:13:13 -0700 Subject: nfsd4: allow NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED RFC5661 says: The client may set one or both of OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL and OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED. Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fdc09a52cd8d..706ada1956c1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -677,6 +677,8 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) switch (w) { case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): return nfs_ok; } xdr_error: -- cgit v1.2.3-58-ga151 From 92bac8c5d60623167c6802b1f125e6d623708185 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:13:29 -0700 Subject: nfsd4: typo logical vs bitwise negate for want_mask Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 706ada1956c1..decea140b323 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -671,7 +671,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) default: return nfserr_bad_xdr; } - w &= !NFS4_SHARE_WANT_MASK; + w &= ~NFS4_SHARE_WANT_MASK; if (!w) return nfs_ok; switch (w) { -- cgit v1.2.3-58-ga151 From 345c284290cabb5484df909303e73d6def8ec8ec Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Thu, 20 Oct 2011 17:51:39 +0800 Subject: nfs41: implement DESTROY_CLIENTID operation According to RFC 5661 section 18.50, implement the DESTROY_CLIENTID operation. Signed-off-by: Mi Jinlong Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4state.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 12 +++++++++++- fs/nfsd/xdr4.h | 5 +++++ 4 files changed, 61 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 458ebb6b59c7..fa383361bc61 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1645,7 +1645,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SEQUENCE", }, [OP_DESTROY_CLIENTID] = { - .op_func = NULL, + .op_func = (nfsd4op_func)nfsd4_destroy_clientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_CLIENTID", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1527aaffb000..47e94e33a975 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1962,6 +1962,50 @@ out: return status; } +static inline bool has_resources(struct nfs4_client *clp) +{ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) +{ + struct nfs4_client *conf, *unconf, *clp; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&dc->clientid); + conf = find_confirmed_client(&dc->clientid); + + if (conf) { + clp = conf; + + if (!is_client_expired(conf) && has_resources(conf)) { + status = nfserr_clientid_busy; + goto out; + } + + /* rfc5661 18.50.3 */ + if (cstate->session && conf == cstate->session->se_client) { + status = nfserr_clientid_busy; + goto out; + } + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + + expire_client(clp); +out: + nfs4_unlock_state(); + dprintk("%s return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index decea140b323..66d095d7955e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1417,6 +1417,16 @@ xdr_error: goto out; } +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) { DECODE_HEAD; @@ -1538,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index e3057350eea1..2364747ee97d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -398,6 +398,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + struct nfsd4_reclaim_complete { u32 rca_one_fs; }; @@ -552,6 +556,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct 
nfsd4_destroy_clientid *); __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); -- cgit v1.2.3-58-ga151 From f9d9ef62cd3ecbd6cbb7957a253c1e81f69d5586 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 29 Sep 2011 13:11:33 +0200 Subject: btrfs: do not allow mounting non-subvolumes via subvol option There's a missing test of whether the path passed to the subvol=path option during mount is a real subvolume, which allows any directory located in the default subvolume to be passed and accepted for mount. (Current btrfs progs prevent this early.) $ btrfs subvol snapshot . p1-snap ERROR: '.' is not a subvolume (with "is subvolume?" test bypassed) $ btrfs subvol snapshot . p1-snap Create a snapshot of '.' in './p1-snap' $ btrfs subvol list -p . ID 258 parent 5 top level 5 path subvol ID 259 parent 5 top level 5 path subvol1 ID 260 parent 5 top level 5 path default-subvol1 ID 262 parent 5 top level 5 path p1/p1-snapshot ID 263 parent 259 top level 5 path subvol1/subvol1-snap The problem is that this gives the false impression of snapshotting the given subvolume when in fact it snapshots the default one: a user expects an outcome like ID 263 but in fact gets ID 262. This patch makes the mount fail with EINVAL and logs a message to syslog. Signed-off-by: David Sterba --- fs/btrfs/super.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 29eecbb6ec3a..5429b1fa0bfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -740,6 +740,16 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + /* * This will strip out the subvol=%s argument for an argument string and add * subvolid=0 to make sure we get the actual tree root for path walking to the @@ -843,6 +853,15 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (error) return ERR_PTR(error); + if (!is_subvolume_inode(path.dentry->d_inode)) { + path_put(&path); + mntput(mnt); + error = -EINVAL; + printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + subvol_name); + return ERR_PTR(-EINVAL); + } + /* Get a ref to the sb and the dentry we found and return it */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); -- cgit v1.2.3-58-ga151 From dff51cd1c60856c28f5d22a571294c2b70b6b322 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 14 Jun 2011 12:52:17 +0200 Subject: btrfs: ratelimit WARN_ON in use_block_rsv Under some circumstances the WARN_ON heavily pollutes the log and slows down the machine. This is just a safety net, as the warning should be fixed by another patch; nevertheless, it still pops up during testing.
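The throttling idiom applied in the diff below is the generic one from linux/ratelimit.h; a minimal sketch of the pattern follows (the function name and message are illustrative, not btrfs code):

#include <linux/kernel.h>
#include <linux/ratelimit.h>

static void report_reservation_failure(int ret)
{
	/* allow at most 2 reports per DEFAULT_RATELIMIT_INTERVAL (5s) */
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 2);

	if (__ratelimit(&rs)) {
		printk(KERN_DEBUG "reservation failed: %d\n", ret);
		WARN_ON(1);
	}
}

__ratelimit() returns nonzero while the caller is still within its burst budget for the current interval, so the WARN_ON backtrace is emitted a bounded number of times instead of once per failure.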
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cef355f1328a..28c4809851a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "ctree.h" @@ -5783,7 +5784,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; -- cgit v1.2.3-58-ga151 From 9562ad9ab36df7ccef920d119f3b5100025db95f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 24 Oct 2011 16:11:30 +0200 Subject: block: Remove the control of complete cpu from bio. bio originally had the functionality to set the completion CPU, but it is broken. Christoph said that "This code is unused, and from the all the discussions lately pretty obviously broken. The only thing keeping it serves is creating more confusion and possibly more bugs." And Jens replied with "We can kill bio_set_completion_cpu(). I'm fine with leaving cpu control to the request based drivers, they are the only ones that can toggle the setting anyway". So this patch removes all the work of controlling the completion CPU from a bio. Cc: Shaohua Li Cc: Christoph Hellwig Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- drivers/md/raid1.c | 1 - fs/bio.c | 1 - include/linux/bio.h | 8 -------- include/linux/blk_types.h | 11 ++++------- 5 files changed, 5 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index 7e1523521c70..da697936d220 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1276,7 +1276,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { - req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; @@ -1362,8 +1361,7 @@ get_rq: */ init_request_from_bio(req, bio); - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); plug = current->plug; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d4ddfa627301..2948a520f7ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2172,7 +2172,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_next = NULL; bio->bi_flags &= ~(BIO_POOL_MASK-1); bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; diff --git a/fs/bio.c b/fs/bio.c index 9bfade8a609b..41c93c722244 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -255,7 +255,6 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); diff --git a/include/linux/bio.h b/include/linux/bio.h index ce33e6868a2f..a3c071c9e189 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -268,14 +268,6 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int
bvec_nr_vecs(unsigned short idx); -/* - * Allow queuer to specify a completion CPU for this bio - */ -static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) -{ - bio->bi_comp_cpu = cpu; -} - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 71fc53bb8f1c..4053cbd4490e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -59,8 +59,6 @@ struct bio { unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ - unsigned int bi_comp_cpu; /* completion CPU */ - atomic_t bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -93,11 +91,10 @@ struct bio { #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ -#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ -#define BIO_NULL_MAPPED 9 /* contains invalid user pages */ -#define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ -#define BIO_QUIET 11 /* Make BIO Quiet */ -#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */ +#define BIO_NULL_MAPPED 8 /* contains invalid user pages */ +#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ +#define BIO_QUIET 10 /* Make BIO Quiet */ +#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* -- cgit v1.2.3-58-ga151 From abfa034e4b8ed0046fa589769e9840af645bc4ba Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 16 Aug 2011 10:50:10 +0530 Subject: fs/9p: Update zero-copy implementation in 9p * remove a lot of updates to different data structures * add a separate callback for zero-copy requests; the above makes the non-zero-copy code path simpler * remove conditionalizing TREAD/TREADDIR/TWRITE in the zero-copy path * Fix the dotu p9_check_errors with zero copy.
Add sufficient doc around * Add support for both in and output buffers in zero copy callback * pin and unpin pages in the same context * use helpers instead of defining page offset and rest of page ourself * Fix mem leak in p9_check_errors * Remove 'E' and 'F' in p9pdu_vwritef Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 2 +- include/net/9p/9p.h | 11 +- include/net/9p/transport.h | 10 +- net/9p/client.c | 391 +++++++++++++++++++++++++++++++++------------ net/9p/protocol.c | 46 +----- net/9p/protocol.h | 1 + net/9p/trans_common.c | 53 ++---- net/9p/trans_common.h | 21 +-- net/9p/trans_virtio.c | 319 +++++++++++++++++++++--------------- 9 files changed, 506 insertions(+), 348 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 9c2bdda5cd9d..ce6600f33659 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -231,7 +231,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (err == 0) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + filp->f_pos); if (err <= 0) goto unlock_and_exit; diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index a6326ef8ade6..d83a01300871 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -359,6 +359,9 @@ enum p9_qid_t { /* Room for readdir header */ #define P9_READDIRHDRSZ 24 +/* size of header for zero copy read/write */ +#define P9_ZC_HDR_SZ 4096 + /** * struct p9_qid - file system entity information * @type: 8-bit type &p9_qid_t @@ -555,10 +558,6 @@ struct p9_rstatfs { * @tag: transaction id of the request * @offset: used by marshalling routines to track current position in buffer * @capacity: used by marshalling routines to track total malloc'd capacity - * @pubuf: Payload user buffer given by the caller - * @pkbuf: Payload kernel buffer given by the caller - * @pbuf_size: pubuf/pkbuf(only one will be !NULL) size to be read/write. - * @private: For transport layer's use. * @sdata: payload * * &p9_fcall represents the structure for all 9P RPC @@ -575,10 +574,6 @@ struct p9_fcall { size_t offset; size_t capacity; - char __user *pubuf; - char *pkbuf; - size_t pbuf_size; - void *private; u8 *sdata; }; diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 83531ebeee99..adcbb20f6511 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -26,13 +26,6 @@ #ifndef NET_9P_TRANSPORT_H #define NET_9P_TRANSPORT_H -#define P9_TRANS_PREF_PAYLOAD_MASK 0x1 - -/* Default. 
Add Payload to PDU before sending it down to transport layer */ -#define P9_TRANS_PREF_PAYLOAD_DEF 0x0 -/* Send pay load separately to transport layer along with PDU.*/ -#define P9_TRANS_PREF_PAYLOAD_SEP 0x1 - /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports @@ -56,13 +49,14 @@ struct p9_trans_module { struct list_head list; char *name; /* name of transport */ int maxsize; /* max message size of transport */ - int pref; /* Preferences of this transport */ int def; /* this transport should be default */ struct module *owner; int (*create)(struct p9_client *, const char *, char *); void (*close) (struct p9_client *); int (*request) (struct p9_client *, struct p9_req_t *req); int (*cancel) (struct p9_client *, struct p9_req_t *req); + int (*zc_request)(struct p9_client *, struct p9_req_t *, + char *, char *, int , int, int, int); }; void v9fs_register_trans(struct p9_trans_module *m); diff --git a/net/9p/client.c b/net/9p/client.c index 0505a03c374c..305a4e719b03 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -203,11 +203,12 @@ free_and_return: * */ -static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) +static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag, int max_size) { unsigned long flags; int row, col; struct p9_req_t *req; + int alloc_msize = min(c->msize, max_size); /* This looks up the original request by tag so we know which * buffer to read the data into */ @@ -245,23 +246,12 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) return ERR_PTR(-ENOMEM); } init_waitqueue_head(req->wq); - if ((c->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) { - int alloc_msize = min(c->msize, 4096); - req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_NOFS); - req->tc->capacity = alloc_msize; - req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_NOFS); - req->rc->capacity = alloc_msize; - } else { - req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_NOFS); - req->tc->capacity = c->msize; - req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_NOFS); - req->rc->capacity = c->msize; - } + req->tc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, + GFP_NOFS); + req->tc->capacity = alloc_msize; + req->rc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, + GFP_NOFS); + req->rc->capacity = alloc_msize; if ((!req->tc) || (!req->rc)) { printk(KERN_ERR "Couldn't grow tag array\n"); kfree(req->tc); @@ -485,27 +475,8 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!p9_is_proto_dotl(c)) { char *ename; - - if (req->tc->pbuf_size) { - /* Handle user buffers */ - size_t len = req->rc->size - req->rc->offset; - if (req->tc->pubuf) { - /* User Buffer */ - err = copy_from_user( - &req->rc->sdata[req->rc->offset], - req->tc->pubuf, len); - if (err) { - err = -EFAULT; - goto out_err; - } - } else { - /* Kernel Buffer */ - memmove(&req->rc->sdata[req->rc->offset], - req->tc->pkbuf, len); - } - } err = p9pdu_readf(req->rc, c->proto_version, "s?d", - &ename, &ecode); + &ename, &ecode); if (err) goto out_err; @@ -515,11 +486,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!err || !IS_ERR_VALUE(err)) { err = p9_errstr2errno(ename, strlen(ename)); - P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, - ename); - - kfree(ename); + P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); } + kfree(ename); } else { err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); 
err = -ecode; @@ -527,7 +497,6 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } - return err; out_err: @@ -536,6 +505,110 @@ out_err: return err; } +/** + * p9_check_zc_errors - check 9p packet for error return and process it + * @c: current client instance + * @req: request to parse and check for error conditions + * @in_hdrlen: Size of response protocol buffer. + * + * returns error code if one is discovered, otherwise returns 0 + * + * this will have to be more complicated if we have multiple + * error packet types + */ + +static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, + char *uidata, int in_hdrlen, int kern_buf) +{ + int err; + int ecode; + int8_t type; + char *ename = NULL; + + err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + if (err) { + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); + return err; + } + + if (type != P9_RERROR && type != P9_RLERROR) + return 0; + + if (!p9_is_proto_dotl(c)) { + /* Error is reported in string format */ + uint16_t len; + /* 7 = header size for RERROR, 2 is the size of string len; */ + int inline_len = in_hdrlen - (7 + 2); + + /* Read the size of error string */ + err = p9pdu_readf(req->rc, c->proto_version, "w", &len); + if (err) + goto out_err; + + ename = kmalloc(len + 1, GFP_NOFS); + if (!ename) { + err = -ENOMEM; + goto out_err; + } + if (len <= inline_len) { + /* We have error in protocol buffer itself */ + if (pdu_read(req->rc, ename, len)) { + err = -EFAULT; + goto out_free; + + } + } else { + /* + * Part of the data is in user space buffer. + */ + if (pdu_read(req->rc, ename, inline_len)) { + err = -EFAULT; + goto out_free; + + } + if (kern_buf) { + memcpy(ename + inline_len, uidata, + len - inline_len); + } else { + err = copy_from_user(ename + inline_len, + uidata, len - inline_len); + if (err) { + err = -EFAULT; + goto out_free; + } + } + } + ename[len] = 0; + if (p9_is_proto_dotu(c)) { + /* For dotu we also have error code */ + err = p9pdu_readf(req->rc, + c->proto_version, "d", &ecode); + if (err) + goto out_free; + err = -ecode; + } + if (!err || !IS_ERR_VALUE(err)) { + err = p9_errstr2errno(ename, strlen(ename)); + + P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); + } + kfree(ename); + } else { + err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = -ecode; + + P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + } + return err; + +out_free: + kfree(ename); +out_err: + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + return err; +} + static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); @@ -579,23 +652,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) return 0; } -/** - * p9_client_rpc - issue a request and wait for a response - * @c: client session - * @type: type of request - * @fmt: protocol format string (see protocol.c) - * - * Returns request structure (which client must free using p9_free_req) - */ - -static struct p9_req_t * -p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) +static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + int8_t type, int req_size, + const char *fmt, va_list ap) { - va_list ap; int tag, err; struct p9_req_t *req; - unsigned long flags; - int sigpending; P9_DPRINTK(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -607,12 +669,6 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
if ((c->status == BeginDisconnect) && (type != P9_TCLUNK)) return ERR_PTR(-EIO); - if (signal_pending(current)) { - sigpending = 1; - clear_thread_flag(TIF_SIGPENDING); - } else - sigpending = 0; - tag = P9_NOTAG; if (type != P9_TVERSION) { tag = p9_idpool_get(c->tagpool); @@ -620,18 +676,50 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) return ERR_PTR(-ENOMEM); } - req = p9_tag_alloc(c, tag); + req = p9_tag_alloc(c, tag, req_size); if (IS_ERR(req)) return req; /* marshall the data */ p9pdu_prepare(req->tc, tag, type); - va_start(ap, fmt); err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); - va_end(ap); if (err) goto reterr; p9pdu_finalize(req->tc); + return req; +reterr: + p9_free_req(c, req); + return ERR_PTR(err); +} + +/** + * p9_client_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_free_req) + */ + +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) +{ + va_list ap; + int sigpending, err; + unsigned long flags; + struct p9_req_t *req; + + va_start(ap, fmt); + req = p9_client_prepare_req(c, type, c->msize, fmt, ap); + va_end(ap); + if (IS_ERR(req)) + return req; + + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } else + sigpending = 0; err = c->trans_mod->request(c, req); if (err < 0) { @@ -639,18 +727,14 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) c->status = Disconnected; goto reterr; } - - P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d\n", req->wq, tag); + /* Wait for the response */ err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); - P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d returned %d\n", - req->wq, tag, err); + req->status >= REQ_STATUS_RCVD); if (req->status == REQ_STATUS_ERROR) { P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } - if ((err == -ERESTARTSYS) && (c->status == Connected)) { P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; @@ -663,13 +747,11 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) if (req->status == REQ_STATUS_RCVD) err = 0; } - if (sigpending) { spin_lock_irqsave(¤t->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - if (err < 0) goto reterr; @@ -678,7 +760,92 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); return req; } +reterr: + P9_DPRINTK(P9_DEBUG_MUX, + "exit: client %p op %d error: %d\n", c, type, err); + p9_free_req(c, req); + return ERR_PTR(err); +} + +/** + * p9_client_zc_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @uidata: user bffer that should be ued for zero copy read + * @uodata: user buffer that shoud be user for zero copy write + * @inlen: read buffer size + * @olen: write buffer size + * @hdrlen: reader header size, This is the size of response protocol data + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_free_req) + */ +static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, + char *uidata, char *uodata, + int inlen, int olen, int in_hdrlen, + int kern_buf, const char *fmt, ...) 
+{ + va_list ap; + int sigpending, err; + unsigned long flags; + struct p9_req_t *req; + + va_start(ap, fmt); + /* + * We allocate a inline protocol data of only 4k bytes. + * The actual content is passed in zero-copy fashion. + */ + req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap); + va_end(ap); + if (IS_ERR(req)) + return req; + + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } else + sigpending = 0; + + /* If we are called with KERNEL_DS force kern_buf */ + if (segment_eq(get_fs(), KERNEL_DS)) + kern_buf = 1; + + err = c->trans_mod->zc_request(c, req, uidata, uodata, + inlen, olen, in_hdrlen, kern_buf); + if (err < 0) { + if (err == -EIO) + c->status = Disconnected; + goto reterr; + } + if (req->status == REQ_STATUS_ERROR) { + P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + err = req->t_err; + } + if ((err == -ERESTARTSYS) && (c->status == Connected)) { + P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + if (c->trans_mod->cancel(c, req)) + p9_client_flush(c, req); + + /* if we received the response anyway, don't signal error */ + if (req->status == REQ_STATUS_RCVD) + err = 0; + } + if (sigpending) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + if (err < 0) + goto reterr; + + err = p9_check_zc_errors(c, req, uidata, in_hdrlen, kern_buf); + if (!err) { + P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); + return req; + } reterr: P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d error: %d\n", c, type, err); @@ -1330,13 +1497,15 @@ int p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, u32 count) { - int err, rsize; - struct p9_client *clnt; - struct p9_req_t *req; char *dataptr; + int kernel_buf = 0; + struct p9_req_t *req; + struct p9_client *clnt; + int err, rsize, non_zc = 0; + - P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, - (long long unsigned) offset, count); + P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", + fid->fid, (long long unsigned) offset, count); err = 0; clnt = fid->clnt; @@ -1348,13 +1517,24 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ - if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { - req = p9_client_rpc(clnt, P9_TREAD, "dqE", fid->fid, offset, - rsize, data, udata); + if (clnt->trans_mod->zc_request && rsize > 1024) { + char *indata; + if (data) { + kernel_buf = 1; + indata = data; + } else + indata = (char *)udata; + /* + * response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREAD, indata, NULL, rsize, 0, + 11, kernel_buf, "dqd", fid->fid, + offset, rsize); } else { + non_zc = 1; req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, - rsize); + rsize); } if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1370,7 +1550,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count); P9_DUMP_PKT(1, req->rc); - if (!req->tc->pbuf_size) { + if (non_zc) { if (data) { memmove(data, dataptr, count); } else { @@ -1396,6 +1576,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, u64 offset, u32 count) { int err, rsize; + int kernel_buf = 0; struct p9_client *clnt; struct p9_req_t *req; @@ 
-1411,19 +1592,24 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, if (count < rsize) rsize = count; - /* Don't bother zerocopy form small IO (< 1024) */ - if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { - req = p9_client_rpc(clnt, P9_TWRITE, "dqE", fid->fid, offset, - rsize, data, udata); + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + char *odata; + if (data) { + kernel_buf = 1; + odata = data; + } else + odata = (char *)udata; + req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, odata, 0, rsize, + P9_ZC_HDR_SZ, kernel_buf, "dqd", + fid->fid, offset, rsize); } else { - if (data) req = p9_client_rpc(clnt, P9_TWRITE, "dqD", fid->fid, - offset, rsize, data); + offset, rsize, data); else req = p9_client_rpc(clnt, P9_TWRITE, "dqU", fid->fid, - offset, rsize, udata); + offset, rsize, udata); } if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1824,7 +2010,7 @@ EXPORT_SYMBOL_GPL(p9_client_xattrcreate); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) { - int err, rsize; + int err, rsize, non_zc = 0; struct p9_client *clnt; struct p9_req_t *req; char *dataptr; @@ -1842,13 +2028,18 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) if (count < rsize) rsize = count; - if ((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) { - req = p9_client_rpc(clnt, P9_TREADDIR, "dqF", fid->fid, - offset, rsize, data); + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + /* + * response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREADDIR, data, NULL, rsize, 0, + 11, 1, "dqd", fid->fid, offset, rsize); } else { + non_zc = 1; req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, - offset, rsize); + offset, rsize); } if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1863,7 +2054,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) P9_DPRINTK(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); - if (!req->tc->pbuf_size && data) + if (non_zc) memmove(data, dataptr, count); p9_free_req(clnt, req); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index df58375ea6b3..b7d4e8aa5383 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -81,7 +81,7 @@ void p9stat_free(struct p9_wstat *stbuf) } EXPORT_SYMBOL(p9stat_free); -static size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size) +size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size) { size_t len = min(pdu->size - pdu->offset, size); memcpy(data, &pdu->sdata[pdu->offset], len); @@ -108,26 +108,6 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) return size - len; } -static size_t -pdu_write_urw(struct p9_fcall *pdu, const char *kdata, const char __user *udata, - size_t size) -{ - BUG_ON(pdu->size > P9_IOHDRSZ); - pdu->pubuf = (char __user *)udata; - pdu->pkbuf = (char *)kdata; - pdu->pbuf_size = size; - return 0; -} - -static size_t -pdu_write_readdir(struct p9_fcall *pdu, const char *kdata, size_t size) -{ - BUG_ON(pdu->size > P9_READDIRHDRSZ); - pdu->pkbuf = (char *)kdata; - pdu->pbuf_size = size; - return 0; -} - /* b - int8_t w - int16_t @@ -459,26 +439,6 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, errcode = -EFAULT; } break; - case 'E':{ - int32_t cnt = va_arg(ap, int32_t); - const char *k = va_arg(ap, const void *); - const char __user *u = 
va_arg(ap, - const void __user *); - errcode = p9pdu_writef(pdu, proto_version, "d", - cnt); - if (!errcode && pdu_write_urw(pdu, k, u, cnt)) - errcode = -EFAULT; - } - break; - case 'F':{ - int32_t cnt = va_arg(ap, int32_t); - const char *k = va_arg(ap, const void *); - errcode = p9pdu_writef(pdu, proto_version, "d", - cnt); - if (!errcode && pdu_write_readdir(pdu, k, cnt)) - errcode = -EFAULT; - } - break; case 'U':{ int32_t count = va_arg(ap, int32_t); const char __user *udata = @@ -637,10 +597,6 @@ void p9pdu_reset(struct p9_fcall *pdu) { pdu->offset = 0; pdu->size = 0; - pdu->private = NULL; - pdu->pubuf = NULL; - pdu->pkbuf = NULL; - pdu->pbuf_size = 0; } int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, diff --git a/net/9p/protocol.h b/net/9p/protocol.h index 2431c0f38d56..a0eb8ff11f22 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -32,3 +32,4 @@ int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); int p9pdu_finalize(struct p9_fcall *pdu); void p9pdu_dump(int, struct p9_fcall *); void p9pdu_reset(struct p9_fcall *pdu); +size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size); diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index 9a70ebdec56e..de8df957867d 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -21,30 +21,25 @@ /** * p9_release_req_pages - Release pages after the transaction. - * @*private: PDU's private page of struct trans_rpage_info */ -void -p9_release_req_pages(struct trans_rpage_info *rpinfo) +void p9_release_pages(struct page **pages, int nr_pages) { int i = 0; - - while (rpinfo->rp_data[i] && rpinfo->rp_nr_pages--) { - put_page(rpinfo->rp_data[i]); + while (pages[i] && nr_pages--) { + put_page(pages[i]); i++; } } -EXPORT_SYMBOL(p9_release_req_pages); +EXPORT_SYMBOL(p9_release_pages); /** * p9_nr_pages - Return number of pages needed to accommodate the payload. */ -int -p9_nr_pages(struct p9_req_t *req) +int p9_nr_pages(char *data, int len) { unsigned long start_page, end_page; - start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; - end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + - PAGE_SIZE - 1) >> PAGE_SHIFT; + start_page = (unsigned long)data >> PAGE_SHIFT; + end_page = ((unsigned long)data + len + PAGE_SIZE - 1) >> PAGE_SHIFT; return end_page - start_page; } EXPORT_SYMBOL(p9_nr_pages); @@ -58,35 +53,17 @@ EXPORT_SYMBOL(p9_nr_pages); * @nr_pages: number of pages to accommodate the payload * @rw: Indicates if the pages are for read or write. 
*/ -int -p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, - int nr_pages, u8 rw) -{ - uint32_t first_page_bytes = 0; - int32_t pdata_mapped_pages; - struct trans_rpage_info *rpinfo; - - *pdata_off = (__force size_t)req->tc->pubuf & (PAGE_SIZE-1); - if (*pdata_off) - first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), - req->tc->pbuf_size); +int p9_payload_gup(char *data, int *nr_pages, struct page **pages, int write) +{ + int nr_mapped_pages; - rpinfo = req->tc->private; - pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, - nr_pages, rw, &rpinfo->rp_data[0]); - if (pdata_mapped_pages <= 0) - return pdata_mapped_pages; + nr_mapped_pages = get_user_pages_fast((unsigned long)data, + *nr_pages, write, pages); + if (nr_mapped_pages <= 0) + return nr_mapped_pages; - rpinfo->rp_nr_pages = pdata_mapped_pages; - if (*pdata_off) { - *pdata_len = first_page_bytes; - *pdata_len += min((req->tc->pbuf_size - *pdata_len), - ((size_t)pdata_mapped_pages - 1) << PAGE_SHIFT); - } else { - *pdata_len = min(req->tc->pbuf_size, - (size_t)pdata_mapped_pages << PAGE_SHIFT); - } + *nr_pages = nr_mapped_pages; return 0; } EXPORT_SYMBOL(p9_payload_gup); diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h index 76309223bb02..173bb550a9eb 100644 --- a/net/9p/trans_common.h +++ b/net/9p/trans_common.h @@ -12,21 +12,6 @@ * */ -/* TRUE if it is user context */ -#define P9_IS_USER_CONTEXT (!segment_eq(get_fs(), KERNEL_DS)) - -/** - * struct trans_rpage_info - To store mapped page information in PDU. - * @rp_alloc:Set if this structure is allocd, not a reuse unused space in pdu. - * @rp_nr_pages: Number of mapped pages - * @rp_data: Array of page pointers - */ -struct trans_rpage_info { - u8 rp_alloc; - int rp_nr_pages; - struct page *rp_data[0]; -}; - -void p9_release_req_pages(struct trans_rpage_info *); -int p9_payload_gup(struct p9_req_t *, size_t *, int *, int, u8); -int p9_nr_pages(struct p9_req_t *); +void p9_release_pages(struct page **, int); +int p9_payload_gup(char *, int *, struct page **, int); +int p9_nr_pages(char *, int); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index e317583fcc73..32aa9834229c 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -150,12 +150,10 @@ static void req_done(struct virtqueue *vq) while (1) { spin_lock_irqsave(&chan->lock, flags); rc = virtqueue_get_buf(chan->vq, &len); - if (rc == NULL) { spin_unlock_irqrestore(&chan->lock, flags); break; } - chan->ring_bufs_avail = 1; spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. 
*/ @@ -163,17 +161,6 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); req = p9_tag_lookup(chan->client, rc->tag); - if (req->tc->private) { - struct trans_rpage_info *rp = req->tc->private; - int p = rp->rp_nr_pages; - /*Release pages */ - p9_release_req_pages(rp); - atomic_sub(p, &vp_pinned); - wake_up(&vp_wq); - if (rp->rp_alloc) - kfree(rp); - req->tc->private = NULL; - } req->status = REQ_STATUS_RCVD; p9_client_cb(chan->client, req); } @@ -193,9 +180,8 @@ static void req_done(struct virtqueue *vq) * */ -static int -pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, - int count) +static int pack_sg_list(struct scatterlist *sg, int start, + int limit, char *data, int count) { int s; int index = start; @@ -224,31 +210,36 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at - * @pdata_off: Offset into the first page * @**pdata: a list of pages to add into sg. + * @nr_pages: number of pages to pack into the scatter/gather list + * @data: data to pack into scatter/gather list * @count: amount of data to pack into the scatter/gather list */ static int -pack_sg_list_p(struct scatterlist *sg, int start, int limit, size_t pdata_off, - struct page **pdata, int count) +pack_sg_list_p(struct scatterlist *sg, int start, int limit, + struct page **pdata, int nr_pages, char *data, int count) { - int s; - int i = 0; + int i = 0, s; + int data_off; int index = start; - if (pdata_off) { - s = min((int)(PAGE_SIZE - pdata_off), count); - sg_set_page(&sg[index++], pdata[i++], s, pdata_off); - count -= s; - } - - while (count) { - BUG_ON(index > limit); - s = min((int)PAGE_SIZE, count); - sg_set_page(&sg[index++], pdata[i++], s, 0); + BUG_ON(nr_pages > (limit - start)); + /* + * if the first page doesn't start at + * page boundary find the offset + */ + data_off = offset_in_page(data); + while (nr_pages) { + s = rest_of_page(data); + if (s > count) + s = count; + sg_set_page(&sg[index++], pdata[i++], s, data_off); + data_off = 0; + data += s; count -= s; + nr_pages--; } - return index-start; + return index - start; } /** @@ -261,114 +252,166 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, size_t pdata_off, static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { - int in, out, inp, outp; - struct virtio_chan *chan = client->trans; + int err; + int in, out; unsigned long flags; - size_t pdata_off = 0; - struct trans_rpage_info *rpinfo = NULL; - int err, pdata_len = 0; + struct virtio_chan *chan = client->trans; P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); req->status = REQ_STATUS_SENT; +req_retry: + spin_lock_irqsave(&chan->lock, flags); + + /* Handle out VirtIO ring buffers */ + out = pack_sg_list(chan->sg, 0, + VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); - if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { - int nr_pages = p9_nr_pages(req); - int rpinfo_size = sizeof(struct trans_rpage_info) + - sizeof(struct page *) * nr_pages; + in = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); - if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_interruptible(vp_wq, - atomic_read(&vp_pinned) < chan->p9_max_pages); + err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); + if (err < 0) { + if (err == -ENOSPC) { + 
chan->ring_bufs_avail = 0; + spin_unlock_irqrestore(&chan->lock, flags); + err = wait_event_interruptible(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; - P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); - } - if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { - /* We can use sdata */ - req->tc->private = req->tc->sdata + req->tc->size; - rpinfo = (struct trans_rpage_info *)req->tc->private; - rpinfo->rp_alloc = 0; + P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); + goto req_retry; } else { - req->tc->private = kmalloc(rpinfo_size, GFP_NOFS); - if (!req->tc->private) { - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: " - "private kmalloc returned NULL"); - return -ENOMEM; - } - rpinfo = (struct trans_rpage_info *)req->tc->private; - rpinfo->rp_alloc = 1; + spin_unlock_irqrestore(&chan->lock, flags); + P9_DPRINTK(P9_DEBUG_TRANS, + "9p debug: " + "virtio rpc add_buf returned failure"); + return -EIO; } + } + virtqueue_kick(chan->vq); + spin_unlock_irqrestore(&chan->lock, flags); - err = p9_payload_gup(req, &pdata_off, &pdata_len, nr_pages, - req->tc->id == P9_TREAD ? 1 : 0); - if (err < 0) { - if (rpinfo->rp_alloc) - kfree(rpinfo); + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); + return 0; +} + +static int p9_get_mapped_pages(struct virtio_chan *chan, + struct page **pages, char *data, + int nr_pages, int write, int kern_buf) +{ + int err; + if (!kern_buf) { + /* + * We allow only p9_max_pages pinned. We wait for the + * Other zc request to finish here + */ + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + (atomic_read(&vp_pinned) < chan->p9_max_pages)); + if (err == -ERESTARTSYS) + return err; + } + err = p9_payload_gup(data, &nr_pages, pages, write); + if (err < 0) return err; - } else { - atomic_add(rpinfo->rp_nr_pages, &vp_pinned); + atomic_add(nr_pages, &vp_pinned); + } else { + /* kernel buffer, no need to pin pages */ + int s, index = 0; + int count = nr_pages; + while (nr_pages) { + s = rest_of_page(data); + pages[index++] = virt_to_page(data); + data += s; + nr_pages--; } + nr_pages = count; } + return nr_pages; +} -req_retry_pinned: - spin_lock_irqsave(&chan->lock, flags); +/** + * p9_virtio_zc_request - issue a zero copy request + * @client: client instance issuing the request + * @req: request to be issued + * @uidata: user bffer that should be ued for zero copy read + * @uodata: user buffer that shoud be user for zero copy write + * @inlen: read buffer size + * @olen: write buffer size + * @hdrlen: reader header size, This is the size of response protocol data + * + */ +static int +p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + char *uidata, char *uodata, int inlen, + int outlen, int in_hdr_len, int kern_buf) +{ + int in, out, err; + unsigned long flags; + int in_nr_pages = 0, out_nr_pages = 0; + struct page **in_pages = NULL, **out_pages = NULL; + struct virtio_chan *chan = client->trans; - /* Handle out VirtIO ring buffers */ - out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, - req->tc->size); - - if (req->tc->pbuf_size && (req->tc->id == P9_TWRITE)) { - /* We have additional write payload buffer to take care */ - if (req->tc->pubuf && P9_IS_USER_CONTEXT) { - outp = pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, - pdata_off, rpinfo->rp_data, pdata_len); - } else { - char *pbuf; - if (req->tc->pubuf) - pbuf = (__force char *) req->tc->pubuf; - else - pbuf = req->tc->pkbuf; - outp = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, pbuf, - 
req->tc->pbuf_size); + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); + + if (uodata) { + out_nr_pages = p9_nr_pages(uodata, outlen); + out_pages = kmalloc(sizeof(struct page *) * out_nr_pages, + GFP_NOFS); + if (!out_pages) { + err = -ENOMEM; + goto err_out; + } + out_nr_pages = p9_get_mapped_pages(chan, out_pages, uodata, + out_nr_pages, 0, kern_buf); + if (out_nr_pages < 0) { + err = out_nr_pages; + kfree(out_pages); + out_pages = NULL; + goto err_out; } - out += outp; } - - /* Handle in VirtIO ring buffers */ - if (req->tc->pbuf_size && - ((req->tc->id == P9_TREAD) || (req->tc->id == P9_TREADDIR))) { - /* - * Take care of additional Read payload. - * 11 is the read/write header = PDU Header(7) + IO Size (4). - * Arrange in such a way that server places header in the - * alloced memory and payload onto the user buffer. - */ - inp = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, 11); - /* - * Running executables in the filesystem may result in - * a read request with kernel buffer as opposed to user buffer. - */ - if (req->tc->pubuf && P9_IS_USER_CONTEXT) { - in = pack_sg_list_p(chan->sg, out+inp, VIRTQUEUE_NUM, - pdata_off, rpinfo->rp_data, pdata_len); - } else { - char *pbuf; - if (req->tc->pubuf) - pbuf = (__force char *) req->tc->pubuf; - else - pbuf = req->tc->pkbuf; - - in = pack_sg_list(chan->sg, out+inp, VIRTQUEUE_NUM, - pbuf, req->tc->pbuf_size); + if (uidata) { + in_nr_pages = p9_nr_pages(uidata, inlen); + in_pages = kmalloc(sizeof(struct page *) * in_nr_pages, + GFP_NOFS); + if (!in_pages) { + err = -ENOMEM; + goto err_out; + } + in_nr_pages = p9_get_mapped_pages(chan, in_pages, uidata, + in_nr_pages, 1, kern_buf); + if (in_nr_pages < 0) { + err = in_nr_pages; + kfree(in_pages); + in_pages = NULL; + goto err_out; } - in += inp; - } else { - in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, - req->rc->sdata, req->rc->capacity); } + req->status = REQ_STATUS_SENT; +req_retry_pinned: + spin_lock_irqsave(&chan->lock, flags); + /* out data */ + out = pack_sg_list(chan->sg, 0, + VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + + if (out_pages) + out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, + out_pages, out_nr_pages, uodata, outlen); + /* + * Take care of in data + * For example TREAD have 11. + * 11 is the read/write header = PDU Header(7) + IO Size (4). + * Arrange in such a way that server places header in the + * alloced memory and payload onto the user buffer. 
+ */ + in = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); + if (in_pages) + in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, + in_pages, in_nr_pages, uidata, inlen); err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); if (err < 0) { @@ -376,28 +419,45 @@ req_retry_pinned: chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + chan->ring_bufs_avail); if (err == -ERESTARTSYS) - return err; + goto err_out; P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, - "9p debug: " - "virtio rpc add_buf returned failure"); - if (rpinfo && rpinfo->rp_alloc) - kfree(rpinfo); - return -EIO; + "9p debug: " + "virtio rpc add_buf returned failure"); + err = -EIO; + goto err_out; } } - virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); - return 0; + err = wait_event_interruptible(*req->wq, + req->status >= REQ_STATUS_RCVD); + /* + * Non kernel buffers are pinned, unpin them + */ +err_out: + if (!kern_buf) { + if (in_pages) { + p9_release_pages(in_pages, in_nr_pages); + atomic_sub(in_nr_pages, &vp_pinned); + } + if (out_pages) { + p9_release_pages(out_pages, out_nr_pages); + atomic_sub(out_nr_pages, &vp_pinned); + } + /* wakeup anybody waiting for slots to pin pages */ + wake_up(&vp_wq); + } + kfree(in_pages); + kfree(out_pages); + return err; } static ssize_t p9_mount_tag_show(struct device *dev, @@ -591,8 +651,8 @@ static struct p9_trans_module p9_virtio_trans = { .create = p9_virtio_create, .close = p9_virtio_close, .request = p9_virtio_request, + .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, - /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accomodate, address @@ -600,7 +660,6 @@ static struct p9_trans_module p9_virtio_trans = { * page in zero copy. */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), - .pref = P9_TRANS_PREF_PAYLOAD_SEP, .def = 0, .owner = THIS_MODULE, }; -- cgit v1.2.3-58-ga151 From 464f5ecf00bb4513ba257520678f5168452f67ba Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Aug 2011 11:31:21 +0530 Subject: fs/9p: inode file operations are properly initialized by init_special_inode Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e3c03db3c788..b5a1076aaa6c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFSOCK: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); -- cgit v1.2.3-58-ga151 From 4d5077f1b2aa502a0ca98b450d1b16fbccfe9c63 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Aug 2011 12:19:34 +0530 Subject: fs/9p: Cleanup option parsing in 9p Instead of requiring that all integer-argument options be listed at the beginning, move the integer parsing into each option's case.
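Reduced to a minimal sketch, the parser then takes the following shape (the option table below is hypothetical, for illustration only; the real ones are the tokens arrays in fs/9p/v9fs.c and net/9p/client.c):

	#include <linux/parser.h>

	enum { Opt_msize, Opt_trans, Opt_err };

	static const match_table_t demo_tokens = {
		{ Opt_msize, "msize=%u" },
		{ Opt_trans, "trans=%s" },
		{ Opt_err, NULL }
	};

	static int demo_parse_token(char *p, substring_t *args, int *msize)
	{
		int option, r;

		switch (match_token(p, demo_tokens, args)) {
		case Opt_msize:
			/* integer options each call match_int() themselves now */
			r = match_int(&args[0], &option);
			if (r < 0)
				return r;	/* "integer field, but no integer?" */
			*msize = option;
			break;
		case Opt_trans:
			/* string option - match_int() is never involved */
			break;
		}
		return 0;
	}

The benefit is that the token enum no longer has to keep all integer options grouped below a sentinel such as Opt_uname or Opt_trans; the ordering of the table becomes irrelevant.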
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 33 ++++++++++++++++++++++++++------- net/9p/client.c | 12 +++++------- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef9661886112..2b78014a124a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_uname) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_debug: v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; @@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltuid = option; break; case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltgid = option; break; case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->afid = option; break; case Opt_uname: diff --git a/net/9p/client.c b/net/9p/client.c index 9eadadb0a698..0edee4de608a 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -123,21 +123,19 @@ static int parse_opts(char *opts, struct p9_client *clnt) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_trans) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_msize: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_msize: clnt->msize = option; break; case Opt_trans: -- cgit v1.2.3-58-ga151 From 348b59012e5c6402741d067cf6eeeb6271999d06 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 7 Aug 2011 00:46:59 +0530 Subject: net/9p: Convert net/9p protocol dumps to tracepoints This gives more control over debugging.
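These are ordinary ftrace events, so the standard tracefs interface applies (generic ftrace usage, nothing specific to this patch):

root@qemu-img-64:~# echo 1 > /sys/kernel/debug/tracing/events/9p/enable

With the events enabled, output like the following transcript lands in the trace buffer: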
root@qemu-img-64:~# ls /pass/123 ls: cannot access /pass/123: No such file or directory root@qemu-img-64:~# cat /sys/kernel/debug/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | ls-1536 [001] 70.928584: 9p_protocol_dump: clnt 18446612132784021504 P9_TWALK(tag = 1) 000: 16 00 00 00 6e 01 00 01 00 00 00 02 00 00 00 01 010: 00 03 00 31 32 33 00 00 00 ff ff ff ff 00 00 00 ls-1536 [001] 70.928587: => trace_9p_protocol_dump => p9pdu_finalize => p9_client_rpc => p9_client_walk => v9fs_vfs_lookup => d_alloc_and_lookup => walk_component => path_lookupat ls-1536 [000] 70.929696: 9p_protocol_dump: clnt 18446612132784021504 P9_RLERROR(tag = 1) 000: 0b 00 00 00 07 01 00 02 00 00 00 4e 03 00 02 00 010: 00 00 00 00 03 00 02 00 00 00 00 00 ff 43 00 00 ls-1536 [000] 70.929697: => trace_9p_protocol_dump => p9_client_rpc => p9_client_walk => v9fs_vfs_lookup => d_alloc_and_lookup => walk_component => path_lookupat => do_path_lookup Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 12 ++-- include/net/9p/9p.h | 3 - include/net/9p/client.h | 6 +- include/trace/events/9p.h | 176 ++++++++++++++++++++++++++++++++++++++++++++++ net/9p/client.c | 77 ++++++++++---------- net/9p/protocol.c | 53 ++++---------- net/9p/protocol.h | 3 +- 7 files changed, 238 insertions(+), 92 deletions(-) create mode 100644 include/trace/events/9p.h (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ce6600f33659..598fff1a54e5 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (rdir->head < rdir->tail) { p9stat_init(&st); - err = p9stat_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, &st, - fid->clnt->proto_version); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; @@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { - err = p9dirent_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, - &curdirent, - fid->clnt->proto_version); + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index d83a01300871..2d70b95b3b55 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -76,11 +76,8 @@ do { \ } \ } while (0) -#define P9_DUMP_PKT(way, pdu) p9pdu_dump(way, pdu) - #else #define P9_DPRINTK(level, format, arg...) 
do { } while (0) -#define P9_DUMP_PKT(way, pdu) do { } while (0) #endif diff --git a/include/net/9p/client.h b/include/net/9p/client.h index d479d7dcc4d5..fc9b90b0c052 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -240,8 +240,8 @@ int p9_client_read(struct p9_fid *fid, char *data, char __user *udata, int p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, u64 offset, u32 count); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset); -int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, - int proto_version); +int p9dirent_read(struct p9_client *clnt, char *buf, int len, + struct p9_dirent *dirent); struct p9_wstat *p9_client_stat(struct p9_fid *fid); int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst); int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *attr); @@ -259,7 +259,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *, u16); void p9_client_cb(struct p9_client *c, struct p9_req_t *req); int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int); -int p9stat_read(char *, int, struct p9_wstat *, int); +int p9stat_read(struct p9_client *, char *, int, struct p9_wstat *); void p9stat_free(struct p9_wstat *); int p9_is_proto_dotu(struct p9_client *clnt); diff --git a/include/trace/events/9p.h b/include/trace/events/9p.h new file mode 100644 index 000000000000..beeaed8398ec --- /dev/null +++ b/include/trace/events/9p.h @@ -0,0 +1,176 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM 9p + +#if !defined(_TRACE_9P_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_9P_H + +#include + +#define show_9p_op(type) \ + __print_symbolic(type, \ + { P9_TLERROR, "P9_TLERROR" }, \ + { P9_RLERROR, "P9_RLERROR" }, \ + { P9_TSTATFS, "P9_TSTATFS" }, \ + { P9_RSTATFS, "P9_RSTATFS" }, \ + { P9_TLOPEN, "P9_TLOPEN" }, \ + { P9_RLOPEN, "P9_RLOPEN" }, \ + { P9_TLCREATE, "P9_TLCREATE" }, \ + { P9_RLCREATE, "P9_RLCREATE" }, \ + { P9_TSYMLINK, "P9_TSYMLINK" }, \ + { P9_RSYMLINK, "P9_RSYMLINK" }, \ + { P9_TMKNOD, "P9_TMKNOD" }, \ + { P9_RMKNOD, "P9_RMKNOD" }, \ + { P9_TRENAME, "P9_TRENAME" }, \ + { P9_RRENAME, "P9_RRENAME" }, \ + { P9_TREADLINK, "P9_TREADLINK" }, \ + { P9_RREADLINK, "P9_RREADLINK" }, \ + { P9_TGETATTR, "P9_TGETATTR" }, \ + { P9_RGETATTR, "P9_RGETATTR" }, \ + { P9_TSETATTR, "P9_TSETATTR" }, \ + { P9_RSETATTR, "P9_RSETATTR" }, \ + { P9_TXATTRWALK, "P9_TXATTRWALK" }, \ + { P9_RXATTRWALK, "P9_RXATTRWALK" }, \ + { P9_TXATTRCREATE, "P9_TXATTRCREATE" }, \ + { P9_RXATTRCREATE, "P9_RXATTRCREATE" }, \ + { P9_TREADDIR, "P9_TREADDIR" }, \ + { P9_RREADDIR, "P9_RREADDIR" }, \ + { P9_TFSYNC, "P9_TFSYNC" }, \ + { P9_RFSYNC, "P9_RFSYNC" }, \ + { P9_TLOCK, "P9_TLOCK" }, \ + { P9_RLOCK, "P9_RLOCK" }, \ + { P9_TGETLOCK, "P9_TGETLOCK" }, \ + { P9_RGETLOCK, "P9_RGETLOCK" }, \ + { P9_TLINK, "P9_TLINK" }, \ + { P9_RLINK, "P9_RLINK" }, \ + { P9_TMKDIR, "P9_TMKDIR" }, \ + { P9_RMKDIR, "P9_RMKDIR" }, \ + { P9_TRENAMEAT, "P9_TRENAMEAT" }, \ + { P9_RRENAMEAT, "P9_RRENAMEAT" }, \ + { P9_TUNLINKAT, "P9_TUNLINKAT" }, \ + { P9_RUNLINKAT, "P9_RUNLINKAT" }, \ + { P9_TVERSION, "P9_TVERSION" }, \ + { P9_RVERSION, "P9_RVERSION" }, \ + { P9_TAUTH, "P9_TAUTH" }, \ + { P9_RAUTH, "P9_RAUTH" }, \ + { P9_TATTACH, "P9_TATTACH" }, \ + { P9_RATTACH, "P9_RATTACH" }, \ + { P9_TERROR, "P9_TERROR" }, \ + { P9_RERROR, "P9_RERROR" }, \ + { P9_TFLUSH, "P9_TFLUSH" }, \ + { P9_RFLUSH, "P9_RFLUSH" }, \ + { P9_TWALK, "P9_TWALK" }, \ + { P9_RWALK, "P9_RWALK" }, \ + { P9_TOPEN, "P9_TOPEN" }, \ + { P9_ROPEN, "P9_ROPEN" }, \ + { P9_TCREATE, 
"P9_TCREATE" }, \ + { P9_RCREATE, "P9_RCREATE" }, \ + { P9_TREAD, "P9_TREAD" }, \ + { P9_RREAD, "P9_RREAD" }, \ + { P9_TWRITE, "P9_TWRITE" }, \ + { P9_RWRITE, "P9_RWRITE" }, \ + { P9_TCLUNK, "P9_TCLUNK" }, \ + { P9_RCLUNK, "P9_RCLUNK" }, \ + { P9_TREMOVE, "P9_TREMOVE" }, \ + { P9_RREMOVE, "P9_RREMOVE" }, \ + { P9_TSTAT, "P9_TSTAT" }, \ + { P9_RSTAT, "P9_RSTAT" }, \ + { P9_TWSTAT, "P9_TWSTAT" }, \ + { P9_RWSTAT, "P9_RWSTAT" }) + +TRACE_EVENT(9p_client_req, + TP_PROTO(struct p9_client *clnt, int8_t type, int tag), + + TP_ARGS(clnt, type, tag), + + TP_STRUCT__entry( + __field( void *, clnt ) + __field( __u8, type ) + __field( __u32, tag ) + ), + + TP_fast_assign( + __entry->clnt = clnt; + __entry->type = type; + __entry->tag = tag; + ), + + TP_printk("client %lu request %s tag %d", + (long)__entry->clnt, show_9p_op(__entry->type), + __entry->tag) + ); + +TRACE_EVENT(9p_client_res, + TP_PROTO(struct p9_client *clnt, int8_t type, int tag, int err), + + TP_ARGS(clnt, type, tag, err), + + TP_STRUCT__entry( + __field( void *, clnt ) + __field( __u8, type ) + __field( __u32, tag ) + __field( __u32, err ) + ), + + TP_fast_assign( + __entry->clnt = clnt; + __entry->type = type; + __entry->tag = tag; + __entry->err = err; + ), + + TP_printk("client %lu response %s tag %d err %d", + (long)__entry->clnt, show_9p_op(__entry->type), + __entry->tag, __entry->err) +); + +/* dump 32 bytes of protocol data */ +#define P9_PROTO_DUMP_SZ 32 +TRACE_EVENT(9p_protocol_dump, + TP_PROTO(struct p9_client *clnt, struct p9_fcall *pdu), + + TP_ARGS(clnt, pdu), + + TP_STRUCT__entry( + __field( void *, clnt ) + __field( __u8, type ) + __field( __u16, tag ) + __array( unsigned char, line, P9_PROTO_DUMP_SZ ) + ), + + TP_fast_assign( + __entry->clnt = clnt; + __entry->type = pdu->id; + __entry->tag = pdu->tag; + memcpy(__entry->line, pdu->sdata, P9_PROTO_DUMP_SZ); + ), + TP_printk("clnt %lu %s(tag = %d)\n%.3x: " + "%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n" + "%.3x: " + "%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n", + (long)__entry->clnt, show_9p_op(__entry->type), + __entry->tag, 0, + __entry->line[0], __entry->line[1], + __entry->line[2], __entry->line[3], + __entry->line[4], __entry->line[5], + __entry->line[6], __entry->line[7], + __entry->line[8], __entry->line[9], + __entry->line[10], __entry->line[11], + __entry->line[12], __entry->line[13], + __entry->line[14], __entry->line[15], + 16, + __entry->line[16], __entry->line[17], + __entry->line[18], __entry->line[19], + __entry->line[20], __entry->line[21], + __entry->line[22], __entry->line[23], + __entry->line[24], __entry->line[25], + __entry->line[26], __entry->line[27], + __entry->line[28], __entry->line[29], + __entry->line[30], __entry->line[31]) + ); + +#endif /* _TRACE_9P_H */ + +/* This part must be outside protection */ +#include diff --git a/net/9p/client.c b/net/9p/client.c index b1c02187f862..854ca7a911c4 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -38,6 +38,9 @@ #include #include "protocol.h" +#define CREATE_TRACE_POINTS +#include + /* * Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options @@ -464,11 +467,15 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int ecode; err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + /* + * dump the response from server + * This should be after check errors which poplulate pdu_fcall. 
+ */ + trace_9p_protocol_dump(c, req->rc); if (err) { P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } - if (type != P9_RERROR && type != P9_RLERROR) return 0; @@ -525,6 +532,11 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, char *ename = NULL; err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + /* + * dump the response from server + * This should be after parse_header which populates pdu_fcall. + */ + trace_9p_protocol_dump(c, req->rc); if (err) { P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -684,7 +696,8 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); if (err) goto reterr; - p9pdu_finalize(req->tc); + p9pdu_finalize(c, req->tc); + trace_9p_client_req(c, type, tag); return req; reterr: p9_free_req(c, req); @@ -755,13 +768,10 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) goto reterr; err = p9_check_errors(c, req); - if (!err) { - P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); + trace_9p_client_res(c, type, req->rc->tag, err); + if (!err) return req; - } reterr: - P9_DPRINTK(P9_DEBUG_MUX, - "exit: client %p op %d error: %d\n", c, type, err); p9_free_req(c, req); return ERR_PTR(err); } @@ -841,13 +851,10 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, goto reterr; err = p9_check_zc_errors(c, req, uidata, in_hdrlen, kern_buf); - if (!err) { - P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); + trace_9p_client_res(c, type, req->rc->tag, err); + if (!err) return req; - } reterr: - P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d error: %d\n", c, type, - err); p9_free_req(c, req); return ERR_PTR(err); } @@ -935,7 +942,7 @@ static int p9_client_version(struct p9_client *c) err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); if (err) { P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err); - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(c, req->rc); goto error; } @@ -1072,15 +1079,14 @@ EXPORT_SYMBOL(p9_client_begin_disconnect); struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, char *uname, u32 n_uname, char *aname) { - int err; + int err = 0; struct p9_req_t *req; struct p9_fid *fid; struct p9_qid qid; - P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", - afid ? afid->fid : -1, uname, aname); - err = 0; + P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", + afid ?
afid->fid : -1, uname, aname); fid = p9_fid_create(clnt); if (IS_ERR(fid)) { err = PTR_ERR(fid); @@ -1097,7 +1103,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1157,7 +1163,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto clunk_fid; } @@ -1224,7 +1230,7 @@ int p9_client_open(struct p9_fid *fid, int mode) err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1267,7 +1273,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1312,7 +1318,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1351,7 +1357,7 @@ int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1542,12 +1548,11 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count); - P9_DUMP_PKT(1, req->rc); if (non_zc) { if (data) { @@ -1617,7 +1622,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1657,7 +1662,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1708,7 +1713,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1856,7 +1861,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1963,7 +1968,7 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, } err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto clunk_fid; } @@ -2047,7 +2052,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) err = 
p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -2084,7 +2089,7 @@ int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, @@ -2115,7 +2120,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, @@ -2150,7 +2155,7 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); @@ -2183,7 +2188,7 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) &glock->start, &glock->length, &glock->proc_id, &glock->client_id); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " @@ -2211,7 +2216,7 @@ int p9_client_readlink(struct p9_fid *fid, char **target) err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index b7d4e8aa5383..55e10a96c902 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -37,40 +37,11 @@ #include #include "protocol.h" +#include + static int p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); -#ifdef CONFIG_NET_9P_DEBUG -void -p9pdu_dump(int way, struct p9_fcall *pdu) -{ - int len = pdu->size; - - if ((p9_debug_level & P9_DEBUG_VPKT) != P9_DEBUG_VPKT) { - if ((p9_debug_level & P9_DEBUG_PKT) == P9_DEBUG_PKT) { - if (len > 32) - len = 32; - } else { - /* shouldn't happen */ - return; - } - } - - if (way) - print_hex_dump_bytes("[9P] ", DUMP_PREFIX_OFFSET, pdu->sdata, - len); - else - print_hex_dump_bytes("]9P[ ", DUMP_PREFIX_OFFSET, pdu->sdata, - len); -} -#else -void -p9pdu_dump(int way, struct p9_fcall *pdu) -{ -} -#endif -EXPORT_SYMBOL(p9pdu_dump); - void p9stat_free(struct p9_wstat *stbuf) { kfree(stbuf->name); @@ -551,7 +522,7 @@ p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) 
return ret; } -int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) +int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) { struct p9_fcall fake_pdu; int ret; @@ -561,10 +532,10 @@ int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) fake_pdu.sdata = buf; fake_pdu.offset = 0; - ret = p9pdu_readf(&fake_pdu, proto_version, "S", st); + ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st); if (ret) { P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); - P9_DUMP_PKT(0, &fake_pdu); + trace_9p_protocol_dump(clnt, &fake_pdu); } return ret; @@ -577,7 +548,7 @@ int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type) return p9pdu_writef(pdu, 0, "dbw", 0, type, tag); } -int p9pdu_finalize(struct p9_fcall *pdu) +int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu) { int size = pdu->size; int err; @@ -586,7 +557,7 @@ int p9pdu_finalize(struct p9_fcall *pdu) err = p9pdu_writef(pdu, 0, "d", size); pdu->size = size; - P9_DUMP_PKT(0, pdu); + trace_9p_protocol_dump(clnt, pdu); P9_DPRINTK(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", pdu->size, pdu->id, pdu->tag); @@ -599,8 +570,8 @@ void p9pdu_reset(struct p9_fcall *pdu) pdu->size = 0; } -int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, - int proto_version) +int p9dirent_read(struct p9_client *clnt, char *buf, int len, + struct p9_dirent *dirent) { struct p9_fcall fake_pdu; int ret; @@ -611,11 +582,11 @@ int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, fake_pdu.sdata = buf; fake_pdu.offset = 0; - ret = p9pdu_readf(&fake_pdu, proto_version, "Qqbs", &dirent->qid, - &dirent->d_off, &dirent->d_type, &nameptr); + ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid, + &dirent->d_off, &dirent->d_type, &nameptr); if (ret) { P9_DPRINTK(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); - P9_DUMP_PKT(1, &fake_pdu); + trace_9p_protocol_dump(clnt, &fake_pdu); goto out; } diff --git a/net/9p/protocol.h b/net/9p/protocol.h index a0eb8ff11f22..2cc525fa49fa 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -29,7 +29,6 @@ int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, va_list ap); int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); -int p9pdu_finalize(struct p9_fcall *pdu); -void p9pdu_dump(int, struct p9_fcall *); +int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu); void p9pdu_reset(struct p9_fcall *pdu); size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size); -- cgit v1.2.3-58-ga151 From 85160e03a79e0d7f9082e61f6a784abc6f402701 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:29 +0400 Subject: CIFS: Implement caching mechanism for mandatory brlocks If we have an oplock and negotiate mandatory locking style we handle all brlock requests on the client. 
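The core idea, reduced to a sketch (the demo_* helpers are hypothetical stand-ins; the real implementation is in fs/cifs/file.c below): while can_cache_brlcks is set, a lock is merely recorded on a per-inode list and conflicts are answered locally; once the oplock is broken, cifs_push_locks() sends the accumulated list to the server.

	/* minimal sketch, not the actual CIFS code */
	static int demo_lock_range(struct cifsInodeInfo *cinode,
				   struct cifsLockInfo *lck)
	{
		if (demo_find_conflict(cinode, lck))	/* hypothetical helper */
			return -EACCES;	/* conflict decided without a round trip */
		if (cinode->can_cache_brlcks) {
			list_add_tail(&lck->llist, &cinode->llist);
			return 0;	/* cached; the server sees nothing yet */
		}
		return demo_send_lock_to_server(lck);	/* hypothetical helper */
	}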
Signed-off-by: Pavel Shilovsky Acked-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 + fs/cifs/file.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 197 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d153d0b89d39..8238aa13e01c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -496,6 +496,8 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); */ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ + struct list_head blist; /* pointer to locks blocked on this */ + wait_queue_head_t block_q; __u64 offset; __u64 length; __u32 pid; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a3b545ff5250..34cbbee30b18 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -275,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, spin_unlock(&cifs_file_list_lock); cifs_set_oplock_level(pCifsInode, oplock); + pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; file->private_data = pCifsFile; return pCifsFile; } +static void cifs_del_lock_waiters(struct cifsLockInfo *lock); + /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding @@ -335,6 +338,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (li->netfid != cifs_file->netfid) continue; list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } mutex_unlock(&cifsi->lock_mutex); @@ -640,24 +644,182 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsInodeInfo *cinode, __u64 len, - __u64 offset, __u8 type, __u16 netfid) +static struct cifsLockInfo * +cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) { struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (li == NULL) - return -ENOMEM; + if (!li) + return li; li->netfid = netfid; li->offset = offset; li->length = len; li->type = type; li->pid = current->tgid; + INIT_LIST_HEAD(&li->blist); + init_waitqueue_head(&li->block_q); + return li; +} + +static void +cifs_del_lock_waiters(struct cifsLockInfo *lock) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, &lock->blist, blist) { + list_del_init(&li->blist); + wake_up(&li->block_q); + } +} + +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, + __u64 length, __u8 type, __u16 netfid, + struct cifsLockInfo **conf_lock) +{ + struct cifsLockInfo *li, *tmp; + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (offset + length <= li->offset || + offset >= li->offset + li->length) + continue; + else if ((type & LOCKING_ANDX_SHARED_LOCK) && + ((netfid == li->netfid && current->tgid == li->pid) || + type == li->type)) + continue; + else { + *conf_lock = li; + return true; + } + } + return false; +} + +static int +cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, struct file_lock *flock) +{ + int rc = 0; + struct cifsLockInfo *conf_lock; + bool exist; + + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (exist) { + flock->fl_start = conf_lock->offset; + flock->fl_end = conf_lock->offset + conf_lock->length - 1; + flock->fl_pid = conf_lock->pid; + if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK) + flock->fl_type = F_RDLCK; + else + flock->fl_type = F_WRLCK; + } else if 
(!cinode->can_cache_brlcks) + rc = 1; + else + flock->fl_type = F_UNLCK; + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, + __u8 type, __u16 netfid) +{ + struct cifsLockInfo *li; + + li = cifs_lock_init(len, offset, type, netfid); + if (!li) + return -ENOMEM; + mutex_lock(&cinode->lock_mutex); list_add_tail(&li->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); return 0; } +static int +cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, bool wait) +{ + struct cifsLockInfo *lock, *conf_lock; + bool exist; + int rc = 0; + + lock = cifs_lock_init(length, offset, type, netfid); + if (!lock) + return -ENOMEM; + +try_again: + exist = false; + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (!exist && cinode->can_cache_brlcks) { + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); + return rc; + } + + if (!exist) + rc = 1; + else if (!wait) + rc = -EACCES; + else { + list_add_tail(&lock->blist, &conf_lock->blist); + mutex_unlock(&cinode->lock_mutex); + rc = wait_event_interruptible(lock->block_q, + (lock->blist.prev == &lock->blist) && + (lock->blist.next == &lock->blist)); + if (!rc) + goto try_again; + else { + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); + mutex_unlock(&cinode->lock_mutex); + } + } + + kfree(lock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + int xid, rc = 0, stored_rc; + struct cifsLockInfo *li, *tmp; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + + xid = GetXid(); + tcon = tlink_tcon(cfile->tlink); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + stored_rc = CIFSSMBLock(xid, tcon, cfile->netfid, + li->pid, li->length, li->offset, + 0, 1, li->type, 0, 0); + if (stored_rc) + rc = stored_rc; + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} + static void cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, bool *wait_flag) @@ -708,6 +870,7 @@ cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, { int rc = 0; __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); __u16 netfid = cfile->netfid; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -723,6 +886,11 @@ cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, return rc; } + rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid, + flock); + if (!rc) + return rc; + /* BB we could chain these into one lock request BB */ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, flock->fl_start, 0, 1, type, 0, 0); @@ -790,12 +958,19 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, } if (lock) { + rc = cifs_lock_add_if(cinode, flock->fl_start, length, + type, netfid, wait_flag); + if (rc < 0) + return rc; + else if (!rc) + goto out; + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, - flock->fl_start, 0, lock, type, wait_flag, 0); + flock->fl_start, 0, 1, type, wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. 
*/ - rc = store_file_lock(cinode, length, flock->fl_start, - type, netfid); + rc = cifs_lock_add(cinode, length, flock->fl_start, + type, netfid); } } else if (unlock) { /* @@ -816,14 +991,19 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (cfile->netfid != li->netfid) continue; - stored_rc = CIFSSMBLock(xid, tcon, netfid, - current->tgid, li->length, - li->offset, 1, 0, li->type, - 0, 0); + if (!cinode->can_cache_brlcks) + stored_rc = CIFSSMBLock(xid, tcon, netfid, + current->tgid, + li->length, li->offset, + 1, 0, li->type, 0, 0); + else + stored_rc = 0; + if (stored_rc) rc = stored_rc; else { list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } } @@ -2404,6 +2584,10 @@ void cifs_oplock_break(struct work_struct *work) cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } + rc = cifs_push_locks(cfile); + if (rc) + cERROR(1, "Push locks rc = %d", rc); + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do -- cgit v1.2.3-58-ga151 From 4f6bcec910d45e4f46b1514977caa529bc69e645 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:30 +0400 Subject: CIFS: Implement caching mechanism for posix brlocks to handle all lock requests on the client in an exclusive oplock case. Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 +- fs/cifs/cifssmb.c | 8 +-- fs/cifs/file.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 147 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c25d0636cc4f..67c26cfe160d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -374,8 +374,8 @@ extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, - const __u64 len, struct file_lock *, + const __u16 smb_file_id, const __u32 netpid, + const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); extern int CIFSSMBEcho(struct TCP_Server_Info *server); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4435b11c41b9..6a45a1769388 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2393,9 +2393,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, const __u64 len, - struct file_lock *pLockData, const __u16 lock_type, - const bool waitFlag) + const __u16 smb_file_id, const __u32 netpid, const int get_flag, + const __u64 len, struct file_lock *pLockData, + const __u16 lock_type, const bool waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -2453,7 +2453,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, } else pSMB->Timeout = 0; - parm_data->pid = cpu_to_le32(current->tgid); + parm_data->pid = cpu_to_le32(netpid); parm_data->start = cpu_to_le64(pLockData->fl_start); parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 34cbbee30b18..805e2bd1dfd5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -788,7 +788,42 @@ try_again: } static int -cifs_push_locks(struct cifsFileInfo 
*cfile) +cifs_posix_lock_test(struct file *file, struct file_lock *flock) +{ + int rc = 0; + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + unsigned char saved_type = flock->fl_type; + + mutex_lock(&cinode->lock_mutex); + posix_test_lock(file, flock); + + if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { + flock->fl_type = saved_type; + rc = 1; + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_posix_lock_set(struct file *file, struct file_lock *flock) +{ + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + int rc; + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + return 1; + } + rc = posix_lock_file_wait(file, flock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_mandatory_locks(struct cifsFileInfo *cfile) { int xid, rc = 0, stored_rc; struct cifsLockInfo *li, *tmp; @@ -820,6 +855,91 @@ cifs_push_locks(struct cifsFileInfo *cfile) return rc; } +/* copied from fs/locks.c with a name change */ +#define cifs_for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; \ + lockp = &(*lockp)->fl_next) + +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct file_lock *flock, **before; + struct cifsLockInfo *lck, *tmp; + int rc = 0, xid, type; + __u64 length; + struct list_head locks_to_send; + + xid = GetXid(); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + INIT_LIST_HEAD(&locks_to_send); + + lock_flocks(); + cifs_for_each_lock(cfile->dentry->d_inode, before) { + flock = *before; + length = 1 + flock->fl_end - flock->fl_start; + if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + type = CIFS_RDLCK; + else + type = CIFS_WRLCK; + + lck = cifs_lock_init(length, flock->fl_start, type, + cfile->netfid); + if (!lck) { + rc = -ENOMEM; + goto send_locks; + } + lck->pid = flock->fl_pid; + + list_add_tail(&lck->llist, &locks_to_send); + } + +send_locks: + unlock_flocks(); + + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + struct file_lock tmp_lock; + int stored_rc; + + tmp_lock.fl_start = lck->offset; + stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, + 0, lck->length, &tmp_lock, + lck->type, 0); + if (stored_rc) + rc = stored_rc; + list_del(&lck->llist); + kfree(lck); + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return cifs_push_posix_locks(cfile); + + return cifs_push_mandatory_locks(cfile); +} + static void cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, bool *wait_flag) @@ -865,24 +985,30 @@ cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, } static int -cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, +cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, bool wait_flag, bool posix_lck, int xid) { int rc = 0; __u64 length = 1 + 
flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); __u16 netfid = cfile->netfid; - struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); if (posix_lck) { int posix_lock_type; + + rc = cifs_posix_lock_test(file, flock); + if (!rc) + return rc; + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, flock, posix_lock_type, - wait_flag); + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 1 /* get */, length, flock, + posix_lock_type, wait_flag); return rc; } @@ -944,6 +1070,11 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (posix_lck) { int posix_lock_type; + + rc = cifs_posix_lock_set(file, flock); + if (!rc || rc < 0) + return rc; + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else @@ -952,8 +1083,9 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, length, - flock, posix_lock_type, wait_flag); + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 0 /* set */, length, flock, + posix_lock_type, wait_flag); goto out; } @@ -1052,7 +1184,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) * negative length which we can not accept over the wire. */ if (IS_GETLK(cmd)) { - rc = cifs_getlk(cfile, flock, type, wait_flag, posix_lck, xid); + rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); FreeXid(xid); return rc; } -- cgit v1.2.3-58-ga151 From 9ee305b70e09f5132c9723780ce10e69710b8bca Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:31 +0400 Subject: CIFS: Send as many mandatory unlock ranges at once as possible, which reduces traffic and improves performance.
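The batching itself is straightforward (condensed sketch with hypothetical demo_* helpers; the real loop is cifs_unlock_range() in the diff below, which makes two passes, one per lock type): matching ranges are packed into one LOCKING_ANDX_RANGE array sized from the negotiated buffer, flushed whenever the array fills, and flushed once more at the end.

	/* illustrative sketch only */
	max_num = (maxBuf - sizeof(struct smb_hdr)) /
		   sizeof(LOCKING_ANDX_RANGE);
	num = 0;
	list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
		if (!demo_range_matches(li, flock))	/* hypothetical filter */
			continue;
		demo_fill_range(&buf[num], li);		/* hypothetical helper */
		if (++num == max_num) {
			rc = cifs_lockv(xid, tcon, netfid, type,
					num, 0, buf);	/* num unlocks, 0 locks */
			num = 0;
		}
	}
	if (num)
		rc = cifs_lockv(xid, tcon, netfid, type, num, 0, buf);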
Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 3 + fs/cifs/cifssmb.c | 40 +++++++++++++ fs/cifs/file.c | 160 ++++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 167 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 67c26cfe160d..ef4f631e4c01 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -368,6 +368,9 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6a45a1769388..6600aa2d2ef3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2320,6 +2320,46 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, return rc; } +int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; + struct kvec iov[2]; + int resp_buf_type; + __u16 count; + + cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->Timeout = 0; + pSMB->NumberOfLocks = cpu_to_le16(num_lock); + pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); + pSMB->LockType = lock_type; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; /* netfid stays le */ + + count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - + (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + iov[1].iov_base = (char *)buf; + iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + + cifs_stats_inc(&tcon->num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) + cFYI(1, "Send error in cifs_lockv = %d", rc); + + return rc; +} int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 805e2bd1dfd5..569184e6ee01 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1057,6 +1057,128 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, return rc; } +static void +cifs_move_llist(struct list_head *source, struct list_head *dest) +{ + struct list_head *li, *tmp; + list_for_each_safe(li, tmp, source) + list_move(li, dest); +} + +static void +cifs_free_llist(struct list_head *llist) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, llist, llist) { + cifs_del_lock_waiters(li); + list_del(&li->llist); + kfree(li); + } +} + +static int +cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) +{ + int rc = 0, stored_rc; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + unsigned int i; + unsigned int max_num, num; + LOCKING_ANDX_RANGE *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct 
cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + mutex_lock(&cinode->lock_mutex); + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cfile->netfid != li->netfid) + continue; + if (types[i] != li->type) + continue; + if (!cinode->can_cache_brlcks) { + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = + cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = + cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add + * it again to the inode list if the unlock + * range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->netfid, + li->type, num, + 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from + * the tmp list to the head of + * the inode list. + */ + cifs_move_llist(&tmp_llist, + &cinode->llist); + rc = stored_rc; + } else + /* + * The unlock range request + * succeed - free the tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; + } else { + /* + * We can cache brlock requests - simply remove + * a lock from the inode list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + } + } + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], num, 0, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cinode->llist); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } + } + + mutex_unlock(&cinode->lock_mutex); + kfree(buf); + return rc; +} + static int cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, bool wait_flag, bool posix_lck, int lock, int unlock, int xid) @@ -1104,43 +1226,9 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, rc = cifs_lock_add(cinode, length, flock->fl_start, type, netfid); } - } else if (unlock) { - /* - * For each stored lock that this unlock overlaps completely, - * unlock it. 
- */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; - - mutex_lock(&cinode->lock_mutex); - list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { - if (flock->fl_start > li->offset || - (flock->fl_start + length) < - (li->offset + li->length)) - continue; - if (current->tgid != li->pid) - continue; - if (cfile->netfid != li->netfid) - continue; - - if (!cinode->can_cache_brlcks) - stored_rc = CIFSSMBLock(xid, tcon, netfid, - current->tgid, - li->length, li->offset, - 1, 0, li->type, 0, 0); - else - stored_rc = 0; + } else if (unlock) + rc = cifs_unlock_range(cfile, flock, xid); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - cifs_del_lock_waiters(li); - kfree(li); - } - } - mutex_unlock(&cinode->lock_mutex); - } out: if (flock->fl_flags & FL_POSIX) posix_lock_file_wait(file, flock); -- cgit v1.2.3-58-ga151 From 32b9aaf1a53b3c8d435f86339b01b3968520cb0a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:32 +0400 Subject: CIFS: Make cifs_push_locks send as many locks at once as possible, which reduces traffic and improves performance. Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 569184e6ee01..ea096ce5d4f7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -829,6 +829,11 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + unsigned int num, max_num; + LOCKING_ANDX_RANGE *buf, *cur; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + int i; xid = GetXid(); tcon = tlink_tcon(cfile->tlink); @@ -840,17 +845,49 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } - list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { - stored_rc = CIFSSMBLock(xid, tcon, cfile->netfid, - li->pid, li->length, li->offset, - 0, 1, li->type, 0, 0); - if (stored_rc) - rc = stored_rc; + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (li->type != types[i]) + continue; + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, 0, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], 0, num, buf); + if (stored_rc) + rc = stored_rc; + } } cinode->can_cache_brlcks = false; mutex_unlock(&cinode->lock_mutex); + kfree(buf); FreeXid(xid); return rc; } -- cgit v1.2.3-58-ga151 From 611d7a5dc6f2a1a0cfd8cc07b9d15f794cbe5f98 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 4 Oct 2011 14:20:17 +0200 Subject: ore: Make ore_calc_stripe_info EXPORT_SYMBOL ore_calc_stripe_info is needed by exofs::export.c for the layout calculations.
Make it exportable Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 8 +++----- include/scsi/osd_ore.h | 3 +++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3b1cc3a132d7..d92998d5c2d6 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -57,9 +57,6 @@ MODULE_LICENSE("GPL"); * 3. Cache some havily used calculations that will be needed by users. */ -static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - enum { BIO_MAX_PAGES_KMALLOC = (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; @@ -409,8 +406,8 @@ EXPORT_SYMBOL(ore_check_io); * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ -static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si) +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; @@ -443,6 +440,7 @@ static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->group_length = T - H; si->M = M; } +EXPORT_SYMBOL(ore_calc_stripe_info); static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pgbase, struct ore_per_dev_state *per_dev, diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index af2231a0fd09..a8e39d14f82b 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -146,6 +146,9 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3-58-ga151 From 3e335672e018c06e007f85a5d54afd721fb3d6d5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 24 Oct 2011 16:36:33 -0700 Subject: fs/Makefile: Always inspect exofs/ fs/exofs directory has multiple targets now, of which the ore.ko will be needed by the pnfs-objects-layout-driver (fs/nfs/objlayout). As suggested by: Michal Marek convert inclusion of exofs/ from obj-$(CONFIG_EXOFS_FS) => obj-$(y). So ORE can be selected also from fs/nfs/Kconfig CC: Michal Marek CC: Al Viro Signed-off-by: Boaz Harrosh --- fs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index afc109691a9b..5c30a13341eb 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(y) += exofs/ # Multiple mods, used by nfs/objlayout obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ -- cgit v1.2.3-58-ga151 From a1fec1dbbc8db974d2582e4040590cebe72171e4 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 12 Oct 2011 18:42:22 +0200 Subject: ore: RAID5 read This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units handling when reading. For writes it inserts BLANK units where the XOR blocks should be calculated and written. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5,
the enablement of raid5 mounting and the setting of parity-count > 0 is disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later, once the basic XOR write is also in. But if the "enable RAID5" patch is applied, this code has been tested to properly read raid5 volumes according to the standard. It has also been tested that the new maths still properly support RAID0 and the grouping code, just as before. (BTW: I have found more bugs in the pnfs-obj RAID math, fixed here.) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will contain the special raid stuff that is not used in striping and mirrors. In future write support these will get bigger. When adding ore_raid.c to the Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep the source file, say ore.c, and the module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh --- fs/exofs/Kbuild | 3 +- fs/exofs/ore.c | 326 +++++++++++++++++++++++++++++++++++++------------ fs/exofs/ore_raid.c | 140 +++++++++++++++++++++ fs/exofs/ore_raid.h | 64 ++++++++++ include/scsi/osd_ore.h | 21 +++- 5 files changed, 473 insertions(+), 81 deletions(-) create mode 100644 fs/exofs/ore_raid.c create mode 100644 fs/exofs/ore_raid.h (limited to 'fs') diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855a6c44..352ba149d23e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -13,7 +13,8 @@ # # ore module library -obj-$(CONFIG_ORE) += ore.o +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d92998d5c2d6..fd6090ddd3bf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -24,24 +24,9 @@ #include #include +#include -#include - -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ +#include "ore_raid.h" MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); @@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -static int _get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - struct ore_io_state **pios) +static int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) { struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ?
_aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(numdevs)); - *pios = NULL; - return -ENOMEM; + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; } ios->layout = layout; @@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, { struct ore_io_state *ios; unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; int ret; - ret = _get_io_state(layout, oc, numdevs, pios); + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs); + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } + } + + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); if (unlikely(ret)) return ret; @@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, ios->offset = offset; if (length) { - ore_calc_stripe_info(layout, offset, &ios->si); - ios->length = (length <= ios->si.group_length) ? 
length : - ios->si.group_length; + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); } return 0; @@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state); int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, struct ore_io_state **pios) { - return _get_io_state(layout, oc, oc->numdevs, pios); + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios) bio_put(per_dev->bio); } + _ore_free_raid_stuff(ios); kfree(ios); } } @@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file * - * U - The number of bytes in a stripe within a group + * D - number of Data devices + * D = group_width - parity * - * U = stripe_unit * group_width + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D * * T - The number of bytes striped within a group of component objects * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth + * T = U * group_depth * * S - The number of bytes striped across all component objects * before the pattern repeats + * S = T * group_count * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * + * M - The "major" (i.e., across all components) cycle number * M = L / S * - * G - Counts the groups from the beginning of the major stripe - * + * G - Counts the groups from the beginning of the major cycle * G = (L - (M * S)) / T [or (L % S) / T] * * H - The byte offset within the group - * * H = (L - (M * S)) % T [or (L % S) % T] * * N - The "minor" (i.e., across the group) stripe number - * * N = H / U * * C - The component index coresponding to L * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] * * O - The component offset coresponding to L - * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) */ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si) + u64 length, struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; u64 group_depth = layout->group_depth; + u32 parity = layout->parity; - u32 U = stripe_unit * group_width; + u32 D = group_width - parity; + u32 U = D * stripe_unit; u64 T = U * group_depth; u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); @@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u32 N = div_u64(H, U); /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= layout->mirrors_p1; + u32 C = (u32)(H - (N * 
U)) / stripe_unit + G * group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); - si->group_length = T - H; + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); -static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct ore_per_dev_state *per_dev, - int cur_len) +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = @@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); ret = -ENOMEM; goto out; } @@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); + unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios) return 0; } - BUG_ON(length > si->group_length); + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; while (length) { unsigned comp = dev - first_dev; @@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (!per_dev->length) { per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { + if (dev == si->dev) { + WARN_ON(dev == 
si->par_dev); per_dev->offset = si->obj_offset; cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; cur_len = stripe_unit; } } else { @@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (cur_len >= length) cur_len = length; - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); if (unlikely(ret)) goto out; @@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios) dev = (dev % devs_in_group) + first_dev; length -= cur_len; + + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || + (!length && ios->parity_pages))) { + if (!length) + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + if (ios->reading) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + } } out: ios->numdevs = devs_in_group; @@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) per_dev->or = or; if (ios->pages) { - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(obj->id), + " dev=%d sg_len=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); + first_dev, per_dev->cur_sg); } else { BUG_ON(ios->kern_buff); @@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, { unsigned stripe_unit = layout->stripe_unit; - ore_calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..8d4b93a93c67 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). 
+ * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#include "ore_raid.h" + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +/* In writes @cur_len means length left. 
.i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; + unsigned array_start = 0; + unsigned i; + int ret; + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + /* TODO: only read support for now */ + clear_highpage(pages[i]); + } + + ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", + per_dev->dev, num_pages, ios->cur_par_page); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + /*TODO: Only raid writes has stuff to add here */ + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->parity_pages) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..c21080b4407f --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. 
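+ * For example (illustrative numbers, not from the patch): with
+ * devs_in_group=4, mirrors_p1=1, par_dev=2 and dev=3 this evaluates
+ * to ((4 + 3 - 2 - 1) % 4) / 1 == 0, i.e. the device right after the
+ * parity holds the stripe's first logical data unit.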
+ */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); + +/* ios.c stuff needed by ios_raid.c */ +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index a8e39d14f82b..43821c18cd3f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -40,6 +40,7 @@ struct ore_layout { unsigned mirrors_p1; unsigned group_width; + unsigned parity; u64 group_depth; unsigned group_count; @@ -89,11 +90,16 @@ static inline void ore_comp_set_dev( } struct ore_striping_info { + u64 offset; u64 obj_offset; - u64 group_length; + u64 length; + u64 first_stripe_start; /* only used in raid writes */ u64 M; /* for truncate */ + unsigned bytes_in_stripe; unsigned dev; + unsigned par_dev; unsigned unit_off; + unsigned cur_comp; }; struct ore_io_state; @@ -127,6 +133,13 @@ struct ore_io_state { bool reading; + /* House keeping of Parity pages */ + bool extra_part_alloc; + struct page **parity_pages; + unsigned max_par_pages; + unsigned cur_par_page; + unsigned sgs_per_dev; + /* Variable array of size numdevs */ unsigned numdevs; struct ore_per_dev_state { @@ -134,7 +147,10 @@ struct ore_io_state { struct bio *bio; loff_t offset; unsigned length; + unsigned last_sgs_total; unsigned dev; + struct osd_sg_entry *sglist; + unsigned cur_sg; } per_dev[]; }; @@ -147,8 +163,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - + u64 length, struct ore_striping_info *si); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3-58-ga151 From 769ba8d92025fa390f3097e658b8ed6e032d68e9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 14 Oct 2011 15:33:51 +0200 Subject: ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, But the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d | n = pages_in_stripe_unit; w = group_width - parity; | pages array presented to the XOR lib | | V | __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---| | | __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- | ... | ... 
| __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ | data added columns first then row --- The pages are put on this array columns first, i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches for a given page in cache, and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and synchronously read. All the pages of a stripe are read in one IO, using the scatter gather mechanism. In writes we constrain our IO to only be incomplete on a single stripe. Meaning either the complete IO is within a single stripe, so we might have pages to read at both the beginning and end of the stripe, or we have some reading to do at the beginning but end at a stripe boundary. The left over pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this.) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: ORE demands that stripe_size may not be bigger than 32 bits.) What else? Well, read it and tell me. (A standalone sketch of the parity-rotation math follows the Kconfig hunk below.) Signed-off-by: Boaz Harrosh --- fs/exofs/Kconfig | 9 +- fs/exofs/ore.c | 36 +++- fs/exofs/ore_raid.c | 534 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/exofs/ore_raid.h | 15 ++ include/scsi/osd_ore.h | 9 + 5 files changed, 587 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae4149291..fa9a286c8771 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,10 +1,17 @@ +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. config ORE tristate + depends on EXOFS_FS + select ASYNC_XOR + default SCSI_OSD_ULD config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD - select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage.
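The parity-rotation arithmetic this write path builds on (documented in ore_calc_stripe_info() by the preceding "ore: RAID5 read" patch) can be sanity-checked outside the kernel. The following is a minimal, self-contained userspace sketch, not part of the patch, using made-up layout numbers (64K stripe_unit, group_width=4, parity=1, mirrors_p1=1); it walks one parity cycle and prints which device holds each data unit and which holds the parity:

#include <stdio.h>
#include <stdint.h>

static unsigned gcd(unsigned a, unsigned b)
{
	while (b) {
		unsigned t = a % b;
		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* assumed example layout, chosen only for illustration */
	const unsigned stripe_unit = 65536, group_width = 4, parity = 1;
	const uint64_t group_depth = 2, group_count = 1;
	const unsigned D = group_width - parity;	/* data devices */
	const unsigned U = D * stripe_unit;		/* bytes per stripe */
	const uint64_t T = (uint64_t)U * group_depth;
	const uint64_t S = T * group_count;
	/* LCMdP = lcm(group_width, parity) / parity */
	const unsigned LCMdP = group_width / gcd(group_width, parity);
	uint64_t L;

	for (L = 0; L < S; L += stripe_unit) {
		uint64_t M = L / S, G = (L % S) / T, H = (L % S) % T;
		unsigned N = (unsigned)(H / U);		/* minor stripe number */
		unsigned C = (unsigned)(H - (uint64_t)N * U) / stripe_unit +
			     (unsigned)G * group_width;
		unsigned first_dev = C - C % group_width;
		unsigned RxP = (N % LCMdP) * parity;	/* R * parity */
		unsigned dev = (group_width + C - RxP) % group_width +
			       first_dev;
		unsigned par_dev = (group_width + group_width - parity - RxP) %
				   group_width + first_dev;

		printf("L=0x%06llx stripe=%u dev=%u par_dev=%u\n",
		       (unsigned long long)L, N, dev, par_dev);
	}
	return 0;
}

With these numbers stripe 0 keeps its parity on the last device and every following stripe rotates the parity one device backwards, matching the Craid/par_dev rotation the comment block describes.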
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fd6090ddd3bf..08ee454b2187 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -95,6 +95,14 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; + + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -118,7 +126,7 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -static int _ore_get_io_state(struct ore_layout *layout, +int _ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, unsigned numdevs, unsigned sgs_per_dev, unsigned num_par_pages, struct ore_io_state **pios) @@ -334,7 +342,7 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static int ore_io_execute(struct ore_io_state *ios) +int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -597,6 +605,8 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, ret = -ENOMEM; goto out; } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + pgbase = 0; ++pg; } @@ -636,6 +646,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; while (length) { unsigned comp = dev - first_dev; @@ -677,14 +688,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) length -= cur_len; si->cur_comp = (si->cur_comp + 1) % group_width; - if (unlikely((dev == si->par_dev) || - (!length && ios->parity_pages))) { - if (!length) + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { /* If we are writing and this is the very last * stripe. then operate on parity dev. */ dev = si->par_dev; - if (ios->reading) + } + if (ios->sp2d) /* In writes cur_len just means if it's the * last one. See _ore_add_parity_unit. */ @@ -709,6 +720,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) devs_in_group + first_dev; /* Next stripe, start fresh */ si->cur_comp = 0; + si->cur_pg = 0; } } out: @@ -873,6 +885,14 @@ int ore_write(struct ore_io_state *ios) int i; int ret; + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. 
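+ * (That is: a RAID-layout writer must point ios->r4w at a
+ * _ore_r4w_op vector, and set ios->private to the cookie those
+ * callbacks receive, before calling ore_write().)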
+ */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + ret = _prepare_for_striping(ios); if (unlikely(ret)) return ret; @@ -888,7 +908,7 @@ int ore_write(struct ore_io_state *ios) } EXPORT_SYMBOL(ore_write); -static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; @@ -952,7 +972,7 @@ int ore_read(struct ore_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _read_mirror(ios, i); + ret = _ore_read_mirror(ios, i); if (unlikely(ret)) return ret; } diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 8d4b93a93c67..29c47e5c4a86 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -14,9 +14,13 @@ */ #include +#include #include "ore_raid.h" +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); @@ -27,6 +31,236 @@ void _raid_page_free(struct page *p) __free_page(p); } +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! 
Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? 
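+ * (For reference: the async_xor() call above XORs the first
+ * data_devs pages into pages[data_devs]; ASYNC_TX_XOR_ZERO_DST
+ * zeroes the destination first, and with no DMA offload engine
+ * present async_tx falls back to a synchronous CPU xor.)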
+ */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last) { @@ -76,6 +310,240 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, } } +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_read_4_write(struct ore_io_state *ios, + struct ore_striping_info *si, struct page *page) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); + if (unlikely(added_len != PAGE_SIZE)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += PAGE_SIZE; + return 0; +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. 
At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; + int ret; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) + /* to-be-written pages start here */ + goto read_last_stripe; + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_read_4_write(ios, &read_si, *pp); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + offset = ios->offset + (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE * PAGE_SIZE; + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); + /* unaligned IO must be within a single stripe */ + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_read_4_write(ios, &read_si, page); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). 
Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + return 0; +} + /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, @@ -86,42 +554,89 @@ int _ore_add_parity_unit(struct ore_io_state *ios, BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); _ore_add_sg_seg(per_dev, cur_len, true); } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; struct page **pages = ios->parity_pages + ios->cur_par_page; - unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; + unsigned num_pages; unsigned array_start = 0; unsigned i; int ret; + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!cur_len) /* If last stripe operate on parity comp */ + si->cur_comp = sp2d->data_devs; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + _read_4_write(ios); + } + for (i = 0; i < num_pages; i++) { pages[i] = _raid_page_alloc(); if (unlikely(!pages[i])) return -ENOMEM; ++(ios->cur_par_page); - /* TODO: only read support for now */ - clear_highpage(pages[i]); } - ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", - per_dev->dev, num_pages, ios->cur_par_page); + BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, per_dev, num_pages * PAGE_SIZE); if (unlikely(ret)) return ret; + + /* TODO: raid6 if (last_parity_dev) */ + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); } return 0; } int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) { - /*TODO: Only raid writes has stuff to add here */ + struct ore_layout *layout = ios->layout; + + if (ios->parity_pages) { + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + unsigned stripe_size = ios->si.bytes_in_stripe; + u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + + BUG_ON(ios->offset % PAGE_SIZE); + + /* Round io down to last full strip */ + first_stripe = div_u64(ios->offset, stripe_size); + last_stripe = div_u64(ios->offset + ios->length, stripe_size); + + /* If an IO spans more then a single stripe it must end at + * a stripe boundary. The reminder at the end is pushed into the + * next IO. 
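+ * For example (illustrative numbers): with a 192K stripe (3 data
+ * devices times a 64K stripe_unit), an IO of offset=64K length=256K
+ * gets first_stripe=0 and last_stripe=1, so ios->length is trimmed
+ * to 1*192K - 64K = 128K and the caller re-submits the remainder.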
+ */ + if (last_stripe != first_stripe) { + ios->length = last_stripe * stripe_size - ios->offset; + + BUG_ON(!ios->length); + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE; + ios->si.length = ios->length; /*make it consistent */ + } + } return 0; } void _ore_free_raid_stuff(struct ore_io_state *ios) { - if (ios->parity_pages) { /* writing and raid */ + if (ios->sp2d) { /* writing and raid */ unsigned i; for (i = 0; i < ios->cur_par_page; i++) { @@ -132,9 +647,14 @@ void _ore_free_raid_stuff(struct ore_io_state *ios) } if (ios->extra_part_alloc) kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); } else { /* Will only be set if raid reading && sglist is big */ if (ios->extra_part_alloc) kfree(ios->per_dev[0].sglist); } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index c21080b4407f..2ffd2c3c6e46 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -57,8 +57,23 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, unsigned cur_len); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} /* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pgbase, struct page **pages, struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 43821c18cd3f..f05fa826f89e 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -99,11 +99,17 @@ struct ore_striping_info { unsigned dev; unsigned par_dev; unsigned unit_off; + unsigned cur_pg; unsigned cur_comp; }; struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); +struct _ore_r4w_op { + /* @Priv given here is passed ios->private */ + struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate); + void (*put_page)(void *priv, struct page *page); +}; struct ore_io_state { struct kref kref; @@ -139,6 +145,9 @@ struct ore_io_state { unsigned max_par_pages; unsigned cur_par_page; unsigned sgs_per_dev; + struct __stripe_pages_2d *sp2d; + struct ore_io_state *ios_read_4_write; + const struct _ore_r4w_op *r4w; /* Variable array of size numdevs */ unsigned numdevs; -- cgit v1.2.3-58-ga151 From dd296619974c50c46c67e58f355a7e85ef3f0c01 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 12 Oct 2011 15:42:07 +0200 Subject: exofs: Support for RAID5 read-4-write interface. The ORE needs to be supplied an r4w_get_page/r4w_put_page API from the filesystem so it can get cache pages to read into when writing partial stripes. Also I commented out and NULLed the .writepage (singular) vector, because it gives a terrible write pattern to raid and is apparently not needed.
Even in OOM conditions the system copes (even better) without it. TODO: How to specify to write_cache_pages() to start or include a certain page? Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 86c0ac87b8e3..3e5f3a6be90a 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -63,6 +63,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -81,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -93,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing @@ -391,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -560,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page = find_get_page(pcol->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if (pcol->that_locked_page != page) { + EXOFS_DBGMSG("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -589,6 +644,7 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; /* pages ownership was passed to pcol_copy */ @@ -773,6 +829,7 @@ static int exofs_writepages(struct address_space *mapping, return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -788,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control
*wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { @@ -894,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, -- cgit v1.2.3-58-ga151 From 44231e686b2ba3b5702db867bb84e6d76b7cf2c7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 21 Nov 2010 20:03:24 +0200 Subject: ore: Enable RAID5 mounts Now that we support raid5, enable it at mount. Raid6 will come next; raid4 is not in demand, so it will probably not be enabled (until someone wants it). NOTE: mkfs.exofs has had support for raid5/6 for a long time. (Making an empty raidX FS is just as easy as raid0 ;-} ) Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 08ee454b2187..fcfa86ae6faf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -49,9 +49,17 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) { u64 stripe_length; -/* FIXME: Only raid0 is supported for now. */ - if (layout->raid_algorithm != PNFS_OSD_RAID_0) { - ORE_ERR("Only RAID_0 for now\n"); + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5 for now\n"); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { -- cgit v1.2.3-58-ga151 From 750c9c47a5f9daa88333ac435a1afe4d4b428230 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 25 Oct 2011 05:35:05 -0400 Subject: ext4: remove messy logic from ext4_ext_rm_leaf - Both callers (truncate and punch_hole) already aligned the left end point, so we no longer need the split logic here (the remaining cases are sketched below). - Remove dead duplicated code. - Call ext4_ext_dirty only after we have updated eh_entries, otherwise we'll lose the entries update.
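A small self-contained sketch of the case analysis that remains after this cleanup; the names mirror the patch, but the helper itself is illustrative userspace code, not the kernel function:

#include <stdio.h>

/* Returns the new extent length in blocks, or -1 for the "bad truncate"
 * case that the aligned callers can no longer produce. [a, b] is the
 * removal range clipped to the extent [ex_ee_block, ex_end]. */
static int rm_leaf_case(unsigned a, unsigned b,
			unsigned ex_ee_block, unsigned ex_ee_len)
{
	unsigned ex_end = ex_ee_block + ex_ee_len - 1;

	if (a > ex_end || b < ex_ee_block)
		return ex_ee_len;	/* no overlap: extent is untouched */
	if (b != ex_end)
		return -1;		/* head/middle removal: ruled out by the callers */
	if (a != ex_ee_block)
		return a - ex_ee_block;	/* remove the tail of the extent */
	return 0;			/* remove the whole extent */
}

int main(void)
{
	printf("%d\n", rm_leaf_case(150, 199, 100, 100)); /* tail: keeps 50 */
	printf("%d\n", rm_leaf_case(100, 199, 100, 100)); /* whole extent: 0 */
	return 0;
}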
Regression caused by d583fb87a3ff0; hit by the 266th testcase in xfstests (http://patchwork.ozlabs.org/patch/120872). Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 103 +++++++----------------------------------------------- 1 file changed, 12 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 385ebb435332..d0e3f333d3f0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2311,13 +2311,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; - ext4_lblk_t a, b, block; + ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; - struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2360,86 +2359,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; - } else if (a != ex_ee_block && - b != ex_ee_block + ex_ee_len - 1) { - /* - * If this is a truncate, then this condition should - * never happen because at least one of the end points - * needs to be on the edge of the extent. - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - block = 0; - num = 0; - err = -EIO; - goto out; - } - /* - * else this is a hole punch, so the extent needs to - * be split since neither edge of the hole is on the - * extent edge - */ - else{ - map.m_pblk = ext4_ext_pblock(ex); - map.m_lblk = ex_ee_block; - map.m_len = b - ex_ee_block; - - err = ext4_split_extent(handle, - inode, path, &map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (err < 0) - goto out; - - ex_ee_len = ext4_ext_get_actual_len(ex); - - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; - - /* Then remove tail of this extent */ - block = ex_ee_block; - num = a - block; - } + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ - block = ex_ee_block; - num = a - block; - } else if (b != ex_ee_block + ex_ee_len - 1) { - /* remove head of the extent */ - block = b; - num = ex_ee_block + ex_ee_len - b; - - /* - * If this is a truncate, this condition - * should never happen - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + num = a - ex_ee_block; } else { /* remove whole extent: excellent!
*/ - block = ex_ee_block; num = 0; - if (a != ex_ee_block) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } - - if (b != ex_ee_block + ex_ee_len - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } } - /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block @@ -2466,19 +2397,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - if (num == 0) { + if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - } else if (block != ex_ee_block) { - /* - * If this was a head removal, then we need to update - * the physical block since it is now at a different - * location - */ - ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); - } - ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); /* * Do not mark uninitialized if all the blocks in the @@ -2486,11 +2408,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (uninitialized && num) ext4_ext_mark_uninitialized(ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto out; - /* * If the extent was completely released, * we need to remove it from the leaf @@ -2513,6 +2430,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, } else *partial_cluster = 0; + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); ex--; -- cgit v1.2.3-58-ga151 From 16d0587090ab93206768f726f71d84ecf55e05c4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:28 +0300 Subject: NFSd: call svc rpcbind cleanup explicitly We have to call svc_rpcb_cleanup() explicitly from nfsd_last_thread(), since this function is registered as the service shutdown callback and thus nobody else will do it for us. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfsd/nfssvc.c | 2 ++ include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b1..52cd976b6099 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -256,6 +256,8 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_serv = NULL; nfsd_shutdown(); + svc_rpcb_cleanup(serv); + printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 223588a976a0..5e71a306216f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -401,6 +401,7 @@ struct svc_procedure { /* * Function prototypes.
*/ +void svc_rpcb_cleanup(struct svc_serv *serv); struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 407462ff4779..252552a685dc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -367,11 +367,12 @@ static int svc_rpcb_setup(struct svc_serv *serv) return 0; } -static void svc_rpcb_cleanup(struct svc_serv *serv) +void svc_rpcb_cleanup(struct svc_serv *serv) { svc_unregister(serv); rpcb_put_local(); } +EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { -- cgit v1.2.3-58-ga151 From a4e5d88b1b24827f4f6a3cba43228936a67d81ba Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 25 Oct 2011 08:15:12 -0400 Subject: ext4: update EOFBLOCKS flag on fallocate properly EOFBLOCKS_FL should be updated when fallocate is called w/o FALLOC_FL_KEEP_SIZE; currently that happens only if a new extent was allocated. TESTCASE: fallocate test_file -n -l4096 fallocate test_file -l4096 The last fallocate cmd updates the size but keeps EOFBLOCKS_FL set, and fsck will complain about that. Also remove the ping-pong in ext4_fallocate() in the case of new extents, where ext4_ext_map_blocks() clears the EOFBLOCKS bit and ext4_falloc_update_inode() later restores it. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents.c | 28 +++++++++++++++++----------- fs/ext4/inode.c | 6 ++++-- 3 files changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4546da4f26c4..3647ae0b21ab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -539,6 +539,8 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d0e3f333d3f0..8686eb756b38 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3432,14 +3432,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) { + if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } - out: if (ret <= 0) { err = ret; @@ -3480,6 +3474,12 @@ out: map_out: map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } out1: if (allocated > map->m_len) allocated = map->m_len; @@ -3955,7 +3955,10 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); @@ -4214,6 +4217,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int ret = 0; int ret2 = 0; int retries = 0; + int flags; struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits;
*file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_NO_NORMALIZE; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; @@ -4259,9 +4267,7 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff6aace0bb3c..87ec615f0fd6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -477,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read((&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } up_read((&EXT4_I(inode)->i_data_sem)); -- cgit v1.2.3-58-ga151 From b9e2780d576a010d4aba1e69f247170bf3718d6b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 25 Oct 2011 05:38:41 -0700 Subject: sysfs: Remove support for tagged directories with untagged members (again) In commit 8a9ea3237e7e ("Merge git://.../davem/net-next"), where my sysfs changes from the net tree merged with the sysfs rbtree changes from Mickulas Patocka, the conflict resolution failed to preserve the simplified property that was the point of my changes. That is, sysfs_find_dirent can now say something is a match if and only if s_name and s_ns match what we are looking for, and sysfs_readdir can simply return all of the directory entries where s_ns matches the directory that we should be returning. Now that we are back to exact matches we can tweak sysfs_find_dirent and the name rb_tree to order sysfs_dirents by s_ns and s_name and remove the second loop in sysfs_find_dirent. However, that change seems a bit much for a conflict resolution, so it can come later. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/sysfs/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0344ee70a47c..48ffbdf0d017 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -590,8 +590,8 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, #undef node } - if (found && ns) { - while (found->s_ns && found->s_ns != ns) { + if (found) { + while (found->s_ns != ns) { p = rb_next(&found->name_node); if (!p) return NULL; @@ -947,7 +947,7 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, #undef node } } - while (pos && pos->s_ns && pos->s_ns != ns) { + while (pos && pos->s_ns != ns) { struct rb_node *p = rb_next(&pos->inode_node); if (!p) pos = NULL; @@ -967,7 +967,7 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, pos = NULL; else pos = rb_entry(p, struct sysfs_dirent, inode_node); - } while (pos && pos->s_ns && pos->s_ns != ns); + } while (pos && pos->s_ns != ns); return pos; } -- cgit v1.2.3-58-ga151 From cf8039036a6e9c5f7144841925f212a957faf1aa Mon Sep 17 00:00:00 2001 From: "Darrick J.
Wong" Date: Tue, 25 Oct 2011 09:18:41 -0400 Subject: ext4: prevent stack overrun in ext4_file_open In ext4_file_open, the filesystem records the mountpoint of the first file that is opened after mounting the filesystem. It does this by allocating a 64-byte stack buffer, calling d_path() to grab the mount point through which this file was accessed, and then memcpy()ing 64 bytes into the superblock's s_last_mounted field, starting from the return value of d_path(), which is stored as "cp". However, if cp > buf (which it frequently is since path components are prepended starting at the end of buf) then we can end up copying stack data into the superblock. Writing stack variables into the superblock doesn't sound like a great idea, so use strlcpy instead. Andi Kleen suggested using strlcpy instead of strncpy. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e4095e988eba..9781099337ec 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp) path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); if (!IS_ERR(cp)) { - memcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); ext4_mark_super_dirty(sb); } } -- cgit v1.2.3-58-ga151 From 7c272194e66e91830b90f6202e61c69f8590f1eb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 3 Aug 2011 09:58:09 -0700 Subject: ceph: make readpages fully async When we get a ->readpages() aop, submit async reads for all page ranges in the provided page list. Lock the pages immediately, so that VFS/MM will block until the reads complete. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 185 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 115 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5a3953db8118..5bb39a50f904 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -228,102 +228,147 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op. 
*/ -static struct page **page_vector_from_list(struct list_head *page_list, - unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct page **pages; - struct page *page; - int next_index, contig_pages = 0; + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + int rc, bytes; + int i; - /* build page vector */ - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + rc = le32_to_cpu(replyhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); - BUG_ON(list_empty(page_list)); - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout("readpages page %d %p\n", contig_pages, page); - pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + + /* unlock all pages, zeroing any data we didn't read */ + for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { + struct page *page = req->r_pages[i]; + + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); } - *nr_pages = contig_pages; - return pages; + kfree(req->r_pages); } /* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. 
*/ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; - int rc = 0; - struct page **pages; - loff_t offset; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_osd_request *req; + u64 off; u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; - dout("readpages %p file %p nr_pages %d\n", - inode, file, nr_pages); - - pages = page_vector_from_list(page_list, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); + off = page->index << PAGE_CACHE_SHIFT; - /* guess read extent */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + } len = nr_pages << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - offset, &len, - ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages, 0); - if (rc == -ENOENT) - rc = 0; - if (rc < 0) - goto out; - - for (; !list_empty(page_list) && len > 0; - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { - struct page *page = - list_entry(page_list->prev, struct page, lru); + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + off, &len, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, + ci->i_truncate_seq, ci->i_truncate_size, + NULL, false, 1, 0); + if (!req) + return -ENOMEM; + /* build page vector */ + nr_pages = len >> PAGE_CACHE_SHIFT; + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); list_del(&page->lru); - - if (rc < (int)PAGE_CACHE_SIZE) { - /* zero (remainder of) page */ - int s = rc < 0 ? 0 : rc; - zero_user_segment(page, s, PAGE_CACHE_SIZE); - } - - if (add_to_page_cache_lru(page, mapping, page->index, + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { page_cache_release(page); - dout("readpages %p add_to_page_cache failed %p\n", + dout("start_read %p add_to_page_cache failed %p\n", inode, page); - continue; + nr_pages = i; + goto out_pages; } - dout("readpages %p adding %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); + pages[i] = page; } - rc = 0; + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_callback = finish_read; + req->r_inode = inode; + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; -out: +out_pages: + ceph_release_page_vector(pages, nr_pages); kfree(pages); +out: + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. 
+ */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + int rc = 0; + + dout("readpages %p file %p nr_pages %d\n", inode, file, nr_pages); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } +out: + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } -- cgit v1.2.3-58-ga151 From 83817e35cbd9b36db955a22418c9e30324353587 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Aug 2011 08:03:44 -0700 Subject: ceph: rename rsize -> rasize It controls readahead. Signed-off-by: Sage Weil --- fs/ceph/super.c | 14 +++++++++++--- fs/ceph/super.h | 8 +++++--- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 88bacaf385d9..387addbf942e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) enum { Opt_wsize, Opt_rsize, + Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -136,6 +137,7 @@ enum { static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, + {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_rsize: fsopt->rsize = intval; break; + case Opt_rasize: + fsopt->rasize = intval; + break; case Opt_caps_wanted_delay_min: fsopt->caps_wanted_delay_min = intval; break; @@ -293,6 +298,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; @@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rsize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -774,10 +782,10 @@ static int ceph_register_bdi(struct super_block *sb, { int err; - /* set ra_pages based on rsize mount option? */ - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + /* set ra_pages based on rasize mount option? 
*/ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a23eed526f05..5c72430b8f71 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -45,8 +46,9 @@ struct ceph_mount_options { int flags; int sb_flags; - int wsize; - int rsize; /* max readahead */ + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; int cap_release_safety; -- cgit v1.2.3-58-ga151 From 0d66a487c120012f33fbcd6af5cbf0a0cad71557 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Aug 2011 08:21:30 -0700 Subject: ceph: implement (optional) max read size The 'rsize' mount option limits the maximum size of an individual read(ahead) operation that is sent off to an OSD. This is distinct from 'rasize', which controls the size of the readahead window. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5bb39a50f904..5ffee90d9fb5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -268,7 +268,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. */ -static int start_read(struct inode *inode, struct list_head *page_list) +static int start_read(struct inode *inode, struct list_head *page_list, int max) { struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; @@ -292,6 +292,8 @@ static int start_read(struct inode *inode, struct list_head *page_list) break; nr_pages++; next_index++; + if (max && nr_pages == max) + break; } len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, @@ -358,11 +360,18 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned nr_pages) { struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int rc = 0; + int max = 0; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d\n", inode, file, nr_pages); + dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + max); while (!list_empty(page_list)) { - rc = start_read(inode, page_list); + rc = start_read(inode, page_list, max); if (rc < 0) goto out; BUG_ON(rc == 0); -- cgit v1.2.3-58-ga151 From 6a8ea4706adb4b4d8f77a8da5f9778b65fbf6f48 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Aug 2011 15:41:37 -0700 Subject: ceph: document ioctls ...after some prodding by Christoph. 
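For illustration, a minimal userspace sketch of querying a file layout with the ioctl documented below (a hypothetical example, not part of the patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* the ceph ioctl definitions below */

	int print_layout(const char *path)
	{
		struct ceph_ioctl_layout l = { 0 };
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0) {
			close(fd);
			return -1;
		}
		printf("su %llu sc %llu os %llu pool %llu\n",
		       (unsigned long long)l.stripe_unit,
		       (unsigned long long)l.stripe_count,
		       (unsigned long long)l.object_size,
		       (unsigned long long)l.data_pool);
		close(fd);
		return 0;
	}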
Signed-off-by: Sage Weil --- fs/ceph/ioctl.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 0c5167e43180..be4a60487333 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -6,7 +6,31 @@ #define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; @@ -21,6 +45,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * * Extract identity, address of the OSD and object storing a given * file offset. */ @@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ #define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif -- cgit v1.2.3-58-ga151 From 6ab00d465a1c8c02c2216f8220727282f3aa50b5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Aug 2011 09:41:59 -0700 Subject: libceph: create messenger with client This simplifies the init/shutdown paths, and makes client->msgr available during the rest of the setup process. 
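As a hedged sketch of the resulting call pattern (the concrete hunks follow), a libceph consumer now passes its feature bits at create time instead of poking client->supported_features afterwards; a user with no extra features, such as rbd, simply passes zeros:

	/* sketch: opt, private and the feature masks come from the caller */
	client = ceph_create_client(opt, private, supported_features,
				    required_features);
	if (IS_ERR(client))
		return PTR_ERR(client);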
Signed-off-by: Sage Weil --- drivers/block/rbd.c | 2 +- fs/ceph/super.c | 9 ++++++--- include/linux/ceph/libceph.h | 4 +++- net/ceph/ceph_common.c | 47 ++++++++++++++++++++++---------------------- 4 files changed, 34 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 15f65b5f3fc7..0b3c5663a187 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -260,7 +260,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - rbdc->client = ceph_create_client(opt, rbdc); + rbdc->client = ceph_create_client(opt, rbdc, 0, 0); if (IS_ERR(rbdc->client)) goto out_rbdc; opt = NULL; /* Now rbdc->client is responsible for opt */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 387addbf942e..f90dc0b011a7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -430,20 +430,23 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; + const unsigned supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + const unsigned required_features = 0; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc); + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 563755181c1e..95bd8502e715 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -215,7 +215,9 @@ extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private); + void *private, + unsigned supported_features, + unsigned required_features); extern u64 ceph_client_id(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 2883ea01e680..97f70e50ad3b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -432,9 +432,12 @@ EXPORT_SYMBOL(ceph_client_id); /* * create a fresh client instance */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, + unsigned supported_features, + unsigned required_features) { struct ceph_client *client; + struct ceph_entity_addr *myaddr = NULL; int err = -ENOMEM; client = kzalloc(sizeof(*client), GFP_KERNEL); @@ -449,15 +452,27 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) client->auth_err = 0; client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; - client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; - - client->msgr = NULL; + client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | + supported_features; + client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | + required_features; + + /* msgr */ + if (ceph_test_opt(client, MYIP)) + myaddr = &client->options->my_addr; + 
client->msgr = ceph_messenger_create(myaddr, + client->supported_features, + client->required_features); + if (IS_ERR(client->msgr)) { + err = PTR_ERR(client->msgr); + goto fail; + } + client->msgr->nocrc = ceph_test_opt(client, NOCRC); /* subsystems */ err = ceph_monc_init(&client->monc, client); if (err < 0) - goto fail; + goto fail_msgr; err = ceph_osdc_init(&client->osdc, client); if (err < 0) goto fail_monc; @@ -466,6 +481,8 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) fail_monc: ceph_monc_stop(&client->monc); +fail_msgr: + ceph_messenger_destroy(client->msgr); fail: kfree(client); return ERR_PTR(err); @@ -490,8 +507,7 @@ void ceph_destroy_client(struct ceph_client *client) ceph_debugfs_client_cleanup(client); - if (client->msgr) - ceph_messenger_destroy(client->msgr); + ceph_messenger_destroy(client->msgr); ceph_destroy_options(client->options); @@ -514,24 +530,9 @@ static int have_mon_and_osd_map(struct ceph_client *client) */ int __ceph_open_session(struct ceph_client *client, unsigned long started) { - struct ceph_entity_addr *myaddr = NULL; int err; unsigned long timeout = client->options->mount_timeout * HZ; - /* initialize the messenger */ - if (client->msgr == NULL) { - if (ceph_test_opt(client, MYIP)) - myaddr = &client->options->my_addr; - client->msgr = ceph_messenger_create(myaddr, - client->supported_features, - client->required_features); - if (IS_ERR(client->msgr)) { - client->msgr = NULL; - return PTR_ERR(client->msgr); - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); - } - /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); if (err < 0) -- cgit v1.2.3-58-ga151 From b61c27636fffbaf1980e675282777b9467254a40 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Aug 2011 15:03:46 -0700 Subject: libceph: don't complain on msgpool alloc failures The pool allocation failures are masked by the pool; there is no need to spam the console about them. (That's the whole point of having the pool in the first place.) Mark msg allocations whose failure is safely handled as such. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- fs/ceph/mds_client.c | 11 ++++++----- include/linux/ceph/messenger.h | 3 ++- net/ceph/messenger.c | 15 +++++++++++---- net/ceph/mon_client.c | 24 +++++++++++++++--------- net/ceph/msgpool.c | 4 ++-- net/ceph/osd_client.c | 8 ++++---- 7 files changed, 41 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8d74ad7ba556..b8731bf3ef1f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -945,7 +945,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? 
(int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 86c59e16ba74..1d72f15fe9f4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; @@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS); + GFP_NOFS, false); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1652,7 +1653,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -2518,7 +2519,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -2831,7 +2832,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index d7adf151d335..bbd4a4bb2c6b 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -238,7 +238,8 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); -extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9918e9eb276e..2de711a0c037 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2281,7 +2281,8 @@ EXPORT_SYMBOL(ceph_con_keepalive); * construct a new message with given type, size * the new msg has a ref count of 1. 
*/ -struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail) { struct ceph_msg *m; @@ -2333,7 +2334,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) m->front.iov_base = kmalloc(front_len, flags); } if (m->front.iov_base == NULL) { - pr_err("msg_new can't allocate %d bytes\n", + dout("ceph_msg_new can't allocate %d bytes\n", front_len); goto out2; } @@ -2348,7 +2349,13 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) out2: ceph_msg_put(m); out: - pr_err("msg_new can't create type %d front %d\n", type, front_len); + if (!can_fail) { + pr_err("msg_new can't create type %d front %d\n", type, + front_len); + } else { + dout("msg_new can't create type %d front %d\n", type, + front_len); + } return NULL; } EXPORT_SYMBOL(ceph_msg_new); @@ -2398,7 +2405,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, } if (!msg) { *skip = 0; - msg = ceph_msg_new(type, front_len, GFP_NOFS); + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 556721665335..af2ef3082499 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -517,10 +517,12 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) init_completion(&req->completion); err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, + true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, + true); if (!req->reply) goto out; @@ -615,10 +617,12 @@ int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, init_completion(&req->completion); err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); + req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, + true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); + req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, + true); if (!req->reply) goto out; @@ -765,19 +769,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) err = -ENOMEM; monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, sizeof(struct ceph_mon_subscribe_ack), - GFP_NOFS); + GFP_NOFS, true); if (!monc->m_subscribe_ack) goto out_con; - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, + true); if (!monc->m_subscribe) goto out_subscribe_ack; - monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, + true); if (!monc->m_auth_reply) goto out_subscribe; - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); monc->pending_auth = 0; if (!monc->m_auth) goto out_auth_reply; @@ -970,7 +976,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, GFP_NOFS); + m = ceph_msg_new(type, front_len, GFP_NOFS, false); break; } diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 
1f4cb30a42c5..11d5f4196a73 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(0, pool->front_len, gfp_mask); + msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -61,7 +61,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, WARN_ON(1); /* try to alloc a fresh message */ - return ceph_msg_new(0, front_len, GFP_NOFS); + return ceph_msg_new(0, front_len, GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 88ad8a2501b5..01ceb59a8b4a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -227,7 +227,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, gfp_flags); + OSD_OPREPLY_FRONT_LEN, gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; @@ -250,7 +250,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; @@ -2032,7 +2032,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); if (!m) goto out; ceph_msg_put(req->r_reply); @@ -2080,7 +2080,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: - return ceph_msg_new(type, front, GFP_NOFS); + return ceph_msg_new(type, front, GFP_NOFS, false); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: -- cgit v1.2.3-58-ga151 From 80db8bea6a0f4fd047eafd8329a44d5a110f462b Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Mon, 22 Aug 2011 13:49:23 -0600 Subject: ceph: replace leading spaces with tabs Trivial formatting fix. 
Signed-off-by: Noah Watkins Signed-off-by: Sage Weil --- fs/ceph/super.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f90dc0b011a7..788f5ad8e66d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -294,29 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; - fsopt->rasize = CEPH_RASIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); -- cgit v1.2.3-58-ga151 From 83eaea22bdfc9e1cec88f81be5b64f30f6c37e8b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 24 Aug 2011 14:07:01 -0700 Subject: Revert "ceph: don't truncate dirty pages in invalidate work thread" This reverts commit c9af9fb68e01eb2c2165e1bc45cfeeed510c64e6. We need to block and truncate all pages in order to reliably invalidate them. Otherwise, we could: - have some uptodate pages in the cache - queue an invalidate - write(2) locks some pages - invalidate_work skips them - write(2) only overwrites part of the page - page now dirty and uptodate -> partial leakage of invalidated data It's not entirely clear why we started skipping locked pages in the first place. I just ran this through fsx and didn't see any problems. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 095799ba9dd1..5dde7d51dc11 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "super.h" #include "mds_client.h" @@ -1363,49 +1362,6 @@ void ceph_queue_invalidate(struct inode *inode) } } -/* - * invalidate any pages that are not dirty or under writeback. this - * includes pages that are clean and mapped. 
- */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ - struct pagevec pvec; - pgoff_t next = 0; - int i; - - pagevec_init(&pvec, 0); - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t index; - int skip_page = - (PageDirty(page) || PageWriteback(page)); - - if (!skip_page) - skip_page = !trylock_page(page); - - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ - index = page->index; - if (index > next) - next = index; - next++; - - if (skip_page) - continue; - - generic_error_remove_page(mapping, page); - unlock_page(page); - } - pagevec_release(&pvec); - cond_resched(); - } -} - /* * Invalidate inode pages in a worker thread. (This can't be done * in the message handler context.) @@ -1429,7 +1385,7 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&inode->i_lock); - ceph_invalidate_nondirty_pages(inode->i_mapping); + truncate_inode_pages(&inode->i_data, 0); spin_lock(&inode->i_lock); if (orig_gen == ci->i_rdcache_gen && -- cgit v1.2.3-58-ga151 From a35eca958aa1c7d0b5f993db1a3ded45ae16d59b Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Thu, 25 Aug 2011 12:43:06 -0700 Subject: ceph: let the set_layout ioctl set single traits Previously we were validating the passed-in stripe unit, object size, and stripe count against each other (and not testing most other stuff). Instead, make sure that the composed previous layout and new values are valid, and only send the new values to the MDS. This lets users change the pool without setting the whole layout, for instance. 
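For example (a hypothetical userspace sketch, not part of the patch), changing only the pool now amounts to leaving every other field zero, given an fd on a newly created file:

	struct ceph_ioctl_layout l = { 0 };

	l.data_pool = 3;	/* hypothetical pool id; zeroed fields mean "keep current" */
	if (ioctl(fd, CEPH_IOC_SET_LAYOUT, &l) < 0)
		perror("CEPH_IOC_SET_LAYOUT");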
Signed-off-by: Greg Farnum --- fs/ceph/ioctl.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 3b256b50f7d8..5a14c29cbba6 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout nl; int err, i; - /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) + /* validate changed params against current layout */ + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + nl.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + } else + return err; + + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + if (l.object_size) + nl.object_size = l.object_size; + if (l.data_pool) + nl.data_pool = l.data_pool; + if (l.preferred_osd) + nl.preferred_osd = l.preferred_osd; + + if ((nl.object_size & ~PAGE_MASK) || + (nl.stripe_unit & ~PAGE_MASK) || + ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) return -EINVAL; /* make sure it's a valid data pool */ -- cgit v1.2.3-58-ga151 From 3310f7541f0c991b51324a7712db51fb8f912601 Mon Sep 17 00:00:00 2001 From: Amon Ott Date: Thu, 20 Oct 2011 13:04:07 -0700 Subject: ceph: fix 32-bit ino numbers Fix 32-bit ino generation to not always be 1. Signed-off-by: Amon Ott --- fs/ceph/super.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5c72430b8f71..b01442aaf278 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -346,9 +346,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) * x86_64+ino32 64 32 * x86_64 64 64 */ -static inline u32 ceph_ino_to_ino32(ino_t ino) +static inline u32 ceph_ino_to_ino32(__u64 vino) { - ino ^= ino >> (sizeof(ino) * 8 - 32); + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; if (!ino) ino = 1; return ino; @@ -359,11 +360,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino) */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino = ceph_ino_to_ino32(ino); + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; #endif - return ino; } /* -- cgit v1.2.3-58-ga151 From 339573406737461cfb17bebabf7ba536a302d841 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Oct 2011 09:05:47 -0700 Subject: libceph: fix double-free of page vector ceph_release_page_vector() kfrees the vector; we shouldn't do it here too. 
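The bug in miniature (sketch): a release helper that already kfrees the vector must not be followed by a second free of the same pointer:

	ceph_release_page_vector(pages, nr_pages);	/* puts the pages and kfrees pages[] */
	kfree(pages);					/* double free -- dropped below */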
Reported-by: Jeff Wu Signed-off-by: Sage Weil --- fs/ceph/addr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5ffee90d9fb5..4144caf2f9d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -345,7 +345,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) out_pages: ceph_release_page_vector(pages, nr_pages); - kfree(pages); out: ceph_osdc_put_request(req); return ret; -- cgit v1.2.3-58-ga151 From 909a4cf1ffe4b875c87abf38239a9bfd25167e0c Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 26 Oct 2011 03:22:31 -0400 Subject: ext4: avoid setting directory i_nlink to zero If a directory has more than EXT4_LINK_MAX subdirectories, its nlink count is set to 1. Subsequently, if any subdirectories are deleted, ext4_dec_count() decrements the i_nlink count, which may go to 0 temporarily before being incremented back to 1. While this is done under i_mutex, which prevents races for directory and inode operations that check i_nlink, the temporary i_nlink == 0 case is exposed to userspace via stat() and similar calls that do not hold i_mutex. Instead, change the code to not decrement the i_nlink count for any directories that do not already have i_nlink larger than 2. Reported-by: Cliff White Reviewed-by: Johann Lombardi Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a067835bbac1..4a550aa07614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1706,9 +1706,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) */ static void ext4_dec_count(handle_t *handle, struct inode *inode) { - drop_nlink(inode); - if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) - inc_nlink(inode); + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } -- cgit v1.2.3-58-ga151 From 665436175c3ca9d35f135e1ba6bdd63745cff08a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 26 Oct 2011 03:32:07 -0400 Subject: ext4: use ext4_reserve_inode_write in ext4_xattr_set_handle ext4_mark_iloc_dirty() says: * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. ext4_xattr_set_handle, however, just open-codes it. May as well use the helper function for consistency. No bug here, just tidiness. (Note: on the cleanup path, ext4_reserve_inode_write sets the bh to NULL if it returns an error, and brelse() of a NULL bh is handled gracefully).
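A hedged before/after sketch of the tidy-up:

	/* before: open-coded */
	error = ext4_get_inode_loc(inode, &is.iloc);
	if (!error)
		error = ext4_journal_get_write_access(handle, is.iloc.bh);

	/* after: the helper performs both steps */
	error = ext4_reserve_inode_write(handle, inode, &is.iloc);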
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc97250..0ae3668520f8 100.644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -985,11 +985,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - goto cleanup; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; -- cgit v1.2.3-58-ga151 From f85b287a01237857a50c93868231f7e831581a27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Oct 2011 03:42:36 -0400 Subject: ext4: error handling fix in ext4_ext_convert_to_initialized() When allocated is unsigned it breaks the error handling at the end of the function when we call: allocated = ext4_split_extent(...); if (allocated < 0) err = allocated; I've made it a signed int instead of unsigned. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8686eb756b38..02a4d80573a1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2928,7 +2928,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; - unsigned int allocated, ee_len, depth; + unsigned int ee_len, depth; + int allocated; int err = 0; int split_flag = 0; -- cgit v1.2.3-58-ga151 From 6f8ff537266ee5396c920fb0c842a21df3055ff3 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Wed, 26 Oct 2011 04:38:59 -0400 Subject: ext4: handle NULL p_ext in ext4_ext_next_allocated_block() In ext4_ext_next_allocated_block(), the path[depth] might have a p_ext that is NULL -- see ext4_ext_binsearch(). In such a case, dereferencing it will crash the machine. This patch checks for p_ext == NULL in ext4_ext_next_allocated_block() before dereferencing it. Tested using a hand-crafted inode with eh_entries == 0 in an extent block; verified that running FIEMAP on it crashes without this patch and works fine with it. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 02a4d80573a1..797b63b59740 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1392,7 +1392,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) while (depth >= 0) { if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext != + if (path[depth].p_ext && + path[depth].p_ext != EXT_LAST_EXTENT(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_ext[1].ee_block); } else { -- cgit v1.2.3-58-ga151 From fcbb5515825f1bb20b7a6f75ec48bee61416f879 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 26 Oct 2011 05:00:19 -0400 Subject: ext4: let ext4_page_mkwrite stop started handle in failure The started journal handle should be stopped in the failure case.
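In miniature (a sketch of the fixed error path; 'needed' stands in for the actual credit count): once the handle has been started, every bail-out must pass through ext4_journal_stop():

	handle = ext4_journal_start(inode, needed);
	/* ... */
	if (walk_page_buffers(handle, page_buffers(page), 0,
			      PAGE_CACHE_SIZE, NULL,
			      do_journal_get_write_access)) {
		unlock_page(page);
		ext4_journal_stop(handle);	/* previously leaked */
		ret = VM_FAULT_SIGBUS;
		goto out;
	}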
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" Acked-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 87ec615f0fd6..e4b26faac5ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4791,6 +4791,7 @@ retry_alloc: PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); goto out; } ext4_set_inode_state(inode, EXT4_STATE_JDATA); -- cgit v1.2.3-58-ga151 From ebbe027797f67d34708ccfabdb129886d549f9ce Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 05:14:27 -0400 Subject: ext4: use stream-alloc when mb_group_prealloc set to zero The kernel will crash on ext4_mb_mark_diskspace_used: BUG_ON(ac->ac_b_ex.fe_len <= 0); after we set /sys/fs/ext4/sda/mb_group_prealloc to zero and create new files in an ext4 filesystem. The reason is that ac_b_ex.fe_len is also set to zero (mb_group_prealloc) in ext4_mb_normalize_group_request, because ac_flags contains EXT4_MB_HINT_GROUP_ALLOC. When someone sets mb_group_prealloc to zero, it means DO NOT USE GROUP PREALLOCATION, so we should set the alloc-strategy to STREAM in this case. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd70b3041185..89762652aae2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4006,6 +4006,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) return; } + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + /* don't use group allocation for large files */ size = max(size, isize); if (size > sbi->s_mb_stream_request) { -- cgit v1.2.3-58-ga151 From 66a83cde47deb4e8874539326e12e88ed82158d3 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 05:29:21 -0400 Subject: ext4: remove unused variable in ext4_mb_generate_from_pa() The variable 'count' in function ext4_mb_generate_from_pa() looks useless, so remove it. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 89762652aae2..c66531d800b1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3345,7 +3345,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3368,7 +3367,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, BUG_ON(groupnr != group); ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } -- cgit v1.2.3-58-ga151 From b051d8dc4e1f011e1b0543a875f5861be5d90222 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 05:30:30 -0400 Subject: ext4: remove unused variable in mb_find_extent() The variable 'ord' in function mb_find_extent() is redundant, so remove it.
Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c66531d800b1..2bb1ddc5c30b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1391,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, { int next = block; int max; - int ord; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1433,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } -- cgit v1.2.3-58-ga151 From 0a10da73e1fa6fb9b45f1166011ff3b04c27c010 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 08:48:54 -0400 Subject: ext4: fix a wrong comment in __mb_check_buddy() The comment says the bit should be 0, but the code that follows asserts the bit to be 1. This is confusing, so fix it. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2bb1ddc5c30b..e2d8be8f28bf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -581,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -- cgit v1.2.3-58-ga151 From b3ff05690845911cc40387176f0bc5a7af9ef3ff Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 26 Oct 2011 11:08:39 -0400 Subject: ext4: don't check io->flag when setting EXT4_STATE_DIO_UNWRITTEN inode state When we want to convert the uninitialized extent in direct write, we can either do it in ext4_end_io_nolock() (AIO case) or in ext4_ext_direct_IO() (non-AIO case), and EXT4_I(inode)->cur_aio_dio is a guard for ext4_ext_map_blocks to find the right case. In e9e3bcecf, we mistakenly changed it to: - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); So now if we map 2 blocks and the first one sets EXT4_IO_END_UNWRITTEN, the 2nd mapping will set the inode state instead, because of the check for the flag. This is wrong.
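In other words, the io->flag check has to nest inside the AIO branch rather than gate it, so a flag set by an earlier mapping can never push a later mapping onto the inode-state path. The corrected shape, as the patch below applies it at both call sites:

	if (io) {
		/* AIO: mark the io_end, counting it only once */
		if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
			io->flag = EXT4_IO_END_UNWRITTEN;
			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
		}
	} else
		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);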
Cc: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 797b63b59740..c2ac06cb2d46 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3390,9 +3390,11 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + if (io) { + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) @@ -3946,9 +3948,11 @@ got_allocated_blocks: * that we need to perform conversion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + if (io) { + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); -- cgit v1.2.3-58-ga151 From 60325f0c6ee7c6b68f95aaa643260fb33d4bdd88 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 26 Oct 2011 18:26:49 -0700 Subject: fs/Makefile: Stupid typo breakage of exofs inclusion In my last patch I made a stupid mistake and broke the exofs compilation completely. Fix it ASAP. Instead of obj-y I wrote obj-$(y). Really Really sorry. Me totally blushing :-{| Signed-off-by: Boaz Harrosh Signed-off-by: Linus Torvalds --- fs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index 5c30a13341eb..d2c3353d5477 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(y) += exofs/ # Multiple mods, used by nfs/objlayout +obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ -- cgit v1.2.3-58-ga151 From 44705754610dbc63503bc7679ff9d9f84978a76f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Oct 2011 04:05:13 -0400 Subject: jbd2: fix build when CONFIG_BUG is not enabled Fix build error when CONFIG_BUG is not enabled: fs/jbd2/transaction.c:1175:3: error: implicit declaration of function '__WARN' by changing __WARN() to WARN_ON(), as suggested by Arnaud Lacombe .
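For background, __WARN() only exists when CONFIG_BUG=y, while WARN_ON() is defined in both configurations and still evaluates its condition when warnings are compiled out. A condensed sketch of the include/asm-generic/bug.h arrangement of that era (simplified; details vary by architecture and config):

	#ifdef CONFIG_BUG
	#define __WARN()	warn_slowpath_null(__FILE__, __LINE__)
	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		if (unlikely(__ret_warn_on))				\
			__WARN();					\
		unlikely(__ret_warn_on);				\
	})
	#else
	/* no __WARN() here - hence the implicit declaration error */
	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		unlikely(__ret_warn_on);				\
	})
	#endif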
Signed-off-by: Randy Dunlap Signed-off-by: "Theodore Ts'o" Cc: Arnd Bergmann Cc: Arnaud Lacombe --- fs/jbd2/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b01fd6104089..1e5c5ead5b0d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -27,6 +27,7 @@ #include #include #include +#include #include static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -1171,8 +1172,7 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - if (ret) - __WARN(); /* All errors are bugs, so dump the stack */ + WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } -- cgit v1.2.3-58-ga151 From 6f91bc5fda82d2c49b4f7fb29424cf6a3c7574bc Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 27 Oct 2011 11:43:23 -0400 Subject: ext4: optimize ext4_ext_convert_to_initialized() This patch introduces a fast path in ext4_ext_convert_to_initialized() for the case when the conversion can be performed by transferring the newly initialized blocks from the uninitialized extent into an adjacent initialized extent. Doing so removes the expensive invocations of memmove() which occur during extent insertion and the subsequent merge. In practice this should be the common case for clients performing append writes into files pre-allocated via fallocate(FALLOC_FL_KEEP_SIZE). In such a workload performed via direct IO and when using a suboptimal implementation of memmove() (x86_64 prior to the 2.6.39 rewrite), this patch reduces kernel CPU consumption by 32%. Two new trace points are added to ext4_ext_convert_to_initialized() to offer visibility into its operations. No exit trace point has been added due to the multiplicity of return points. This can be revisited once the upstream cleanup is backported. Signed-off-by: Eric Gouriou Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 93 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/super.c | 1 + include/trace/events/ext4.h | 82 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c2ac06cb2d46..8b6a17b60970 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2919,12 +2919,23 @@ out: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. 
*/ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; @@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); + eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len); + + /* + * Attempt to transfer newly initialized blocks from the currently + * uninitialized extent to its left neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. This is the common case in steady state for + * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append + * writes. + * + * Limitations of the current logic: + * - L1: we only deal with writes at the start of the extent. + * The approach could be extended to writes at the end + * of the extent but this scenario was deemed less common. + * - L2: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L3: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && /*L1*/ + (map->m_len < ee_len) && /*L2*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ + struct ext4_extent *prev_ex; + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len, write_len; + + prev_ex = ex - 1; + prev_lblk = le32_to_cpu(prev_ex->ee_block); + prev_len = ext4_ext_get_actual_len(prev_ex); + prev_pblk = ext4_ext_pblock(prev_ex); + ee_pblk = ext4_ext_pblock(ex); + write_len = map->m_len; + + /* + * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * upon those conditions: + * - C1: prev_ex is initialized, + * - C2: prev_ex is logically abutting ex, + * - C3: prev_ex is physically abutting ex, + * - C4: prev_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
+ */ + if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, prev_ex); + + /* Shift the start of ex by 'write_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + write_len); + ext4_ext_store_pblock(ex, ee_pblk + write_len); + ex->ee_len = cpu_to_le16(ee_len - write_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend prev_ex by 'write_len' blocks */ + prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = prev_ex; + + /* Result: number of initialized blocks past m_lblk */ + allocated = write_len; + goto out; + } + } + WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcc460537bc7..9953d80145ad 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include #include "ext4.h" +#include "ext4_extents.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index c9a341e385a3..748ff7cbe555 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -9,6 +9,7 @@ struct ext4_allocation_context; struct ext4_allocation_request; +struct ext4_extent; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; @@ -1394,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_ARGS(inode) ); +/* 'ux' is the uninitialized extent. */ +TRACE_EVENT(ext4_ext_convert_to_initialized_enter, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + struct ext4_extent *ux), + + TP_ARGS(inode, map, ux), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, m_lblk ) + __field( unsigned, m_len ) + __field( ext4_lblk_t, u_lblk ) + __field( unsigned, u_len ) + __field( ext4_fsblk_t, u_pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->m_lblk = map->m_lblk; + __entry->m_len = map->m_len; + __entry->u_lblk = le32_to_cpu(ux->ee_block); + __entry->u_len = ext4_ext_get_actual_len(ux); + __entry->u_pblk = ext4_ext_pblock(ux); + ), + + TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " + "u_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->m_lblk, __entry->m_len, + __entry->u_lblk, __entry->u_len, __entry->u_pblk) +); + +/* + * 'ux' is the uninitialized extent. + * 'ix' is the initialized extent to which blocks are transferred. 
+ */ +TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + struct ext4_extent *ux, struct ext4_extent *ix), + + TP_ARGS(inode, map, ux, ix), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, m_lblk ) + __field( unsigned, m_len ) + __field( ext4_lblk_t, u_lblk ) + __field( unsigned, u_len ) + __field( ext4_fsblk_t, u_pblk ) + __field( ext4_lblk_t, i_lblk ) + __field( unsigned, i_len ) + __field( ext4_fsblk_t, i_pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->m_lblk = map->m_lblk; + __entry->m_len = map->m_len; + __entry->u_lblk = le32_to_cpu(ux->ee_block); + __entry->u_len = ext4_ext_get_actual_len(ux); + __entry->u_pblk = ext4_ext_pblock(ux); + __entry->i_lblk = le32_to_cpu(ix->ee_block); + __entry->i_len = ext4_ext_get_actual_len(ix); + __entry->i_pblk = ext4_ext_pblock(ix); + ), + + TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " + "u_lblk %u u_len %u u_pblk %llu " + "i_lblk %u i_len %u i_pblk %llu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->m_lblk, __entry->m_len, + __entry->u_lblk, __entry->u_len, __entry->u_pblk, + __entry->i_lblk, __entry->i_len, __entry->i_pblk) +); + DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), -- cgit v1.2.3-58-ga151 From 80e675f906db54eb1ce3a9555cee5f45b5b72ab2 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 27 Oct 2011 11:52:18 -0400 Subject: ext4: optimize memmove lengths in extent/index insertions ext4_ext_insert_extent() (respectively ext4_ext_insert_index()) was using EXT_MAX_EXTENT() (resp. EXT_MAX_INDEX()) to determine how many entries needed to be moved beyond the insertion point. In practice this means that (320 - I) * 24 bytes were memmove()'d when I is the insertion point, rather than (#entries - I) * 24 bytes. This patch uses EXT_LAST_EXTENT() (resp. EXT_LAST_INDEX()) instead to only move existing entries. The code flow is also simplified slightly to highlight similarities and reduce code duplication in the insertion logic. This patch reduces system CPU consumption by over 25% on a 4kB synchronous append DIO write workload when used with the pre-2.6.39 x86_64 memmove() implementation. With the much faster 2.6.39 memmove() implementation we still see a decrease in system CPU usage between 2% and 7%. Note that the ext_debug() output changes with this patch, splitting some log information between entries. Users of the ext_debug() output should note that the "move %d" units changed from reporting the number of bytes moved to reporting the number of entries moved. Signed-off-by: Eric Gouriou Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 85 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8b6a17b60970..29969622af8b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -754,31 +754,25 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, return -EIO; } - len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { - len = (len - 1) * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d after: %llu. 
" - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - (curp->p_idx + 1), (curp->p_idx + 2)); - memmove(curp->p_idx + 2, curp->p_idx + 1, len); - } + ext_debug("insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - len = len * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d before: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - curp->p_idx, (curp->p_idx + 1)); - memmove(curp->p_idx + 1, curp->p_idx, len); + ext_debug("insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EIO; @@ -1779,41 +1773,46 @@ has_space: ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); - path[depth].p_ext = EXT_FIRST_EXTENT(eh); - } else if (le32_to_cpu(newext->ee_block) + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { -/* BUG_ON(newext->ee_block == nearex->ee_block); */ - if (nearex != EXT_LAST_EXTENT(eh)) { - len = EXT_MAX_EXTENT(eh) - nearex; - len = (len - 1) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", + /* Insert after */ + ext_debug("insert %d:%llu:[%d]%d %s before: " + "nearest 0x%p\n" + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %d:%llu:[%d]%d %s after: " + "nearest 0x%p\n" le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 2, nearex + 1, len); + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %d:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); } - path[depth].p_ext = nearex + 1; - } else { - BUG_ON(newext->ee_block == nearex->ee_block); - len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); - len = len < 0 ? 
0 : len; - ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - nearex, len, nearex, nearex + 1); - memmove(nearex + 1, nearex, len); - path[depth].p_ext = nearex; } le16_add_cpu(&eh->eh_entries, 1); - nearex = path[depth].p_ext; + path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; -- cgit v1.2.3-58-ga151 From 96814ecb404d587fade79af70bb741bc753e9ffb Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 24 Oct 2011 20:46:50 -0500 Subject: Add definition for share encryption Samba supports a setfs info level to negotiate encrypted shares. This patch adds the defines so we recognize this info level. Later patches will add the enablement for it. Acked-by: Jeremy Allison Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 3c6ef34fe2bc..3fb03e2c8e86 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1911,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */ /* SETFSInfo Levels */ #define SMB_SET_CIFS_UNIX_INFO 0x200 +/* level 0x203 is defined above in list of QFS info levels */ +/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ + +/* Level 0x200 request structure follows */ typedef struct smb_com_transaction2_setfsi_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -1938,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req { __le64 ClientUnixCap; /* Data end */ } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; +/* level 0x203 request structure follows */ +typedef struct smb_com_transaction2_setfs_enc_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 Reserved4; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + /* NTLMSSP Blob, Data start. */ +} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; + +/* response for setfsinfo levels 0x200 and 0x203 */ typedef struct smb_com_transaction2_setfsi_rsp { struct smb_hdr hdr; /* wct = 10 */ struct trans2_resp t2; __u16 ByteCount; } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; - typedef struct smb_com_transaction2_get_dfs_refer_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -2096,13 +2126,13 @@ typedef struct { #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX -/* Can not set pathnames cap yet until we send new posix create SMB since - otherwise server can treat such handles opened with older ntcreatex - (by a new client which knows how to send posix path ops) - as non-posix handles (can affect write behavior with byte range locks. 
- We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send + LockingX instead of posix locking call on unix sess (and we do not expect + LockingX to use different (ie Windows) semantics than posix locking on + the same session (if WINE needs to do this later, we can add this cap + back in later */ /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ -#define CIFS_UNIX_CAP_MASK 0x000000db +#define CIFS_UNIX_CAP_MASK 0x000003db #else #define CIFS_UNIX_CAP_MASK 0x00000013 #endif /* CONFIG_CIFS_POSIX */ -- cgit v1.2.3-58-ga151 From 814e1d25a59662f9552e6dc1305d1df3616fc87e Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 1 Sep 2011 08:22:57 +0800 Subject: cleanup: vfs: small comment fix for block_invalidatepage The patch is against 3.1-rc3. Signed-off-by: Wang Sheng-Hui Signed-off-by: Christoph Hellwig --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..936d6035f6e2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1470,13 +1470,13 @@ static void discard_buffer(struct buffer_head * bh) } /** - * block_invalidatepage - invalidate part of all of a buffer-backed page + * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected * @offset: the index of the truncation point * * block_invalidatepage() is called when all or part of the page has become - * invalidatedby a truncate operation. + * invalidated by a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O -- cgit v1.2.3-58-ga151 From a877ee03ac010ded434b77f7831f43cbb1fcc60f Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 7 Oct 2011 13:41:15 -0400 Subject: vfs: add "device" tag to /proc/self/mountstats nfsiostat was failing to find mounted filesystems on kernels after 2.6.38 because of changes to show_vfsstat() by commit c7f404b40a3665d9f4e9a927cc5c1ee0479ed8f9. This patch adds back the "device" tag before the nfs server entry so scripts can parse the mountstats file correctly. Signed-off-by: Bryan Schumaker CC: stable@kernel.org [>=2.6.39] Signed-off-by: Christoph Hellwig --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b4febb29d3bb..e5e1c7d1839b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1109,6 +1109,7 @@ static int show_vfsstat(struct seq_file *m, void *v) /* device */ if (mnt->mnt_sb->s_op->show_devname) { + seq_puts(m, "device "); err = mnt->mnt_sb->s_op->show_devname(m, mnt); } else { if (mnt->mnt_devname) { -- cgit v1.2.3-58-ga151 From 1448c721e4fa2faf742029a0403b4b787fccb7fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Oct 2011 13:40:02 -0700 Subject: compat: sync compat_stats with statfs. This was found by inspection while tracking a similar bug in compat_statfs64, that has been fixed in mainline since December. - This fixes a bug where not all of the f_spare fields were cleared on mips and s390. - Add the f_flags field to struct compat_statfs - Copy f_flags to userspace in case someone cares. - Use __clear_user to copy the f_spare field to userspace to ensure that all of the elements of f_spare are cleared. On some architectures f_spare has 5 ints and on some architectures f_spare only has 4 ints, which makes the previous technique of clearing each int individually broken.
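Sizing the zeroing from the array type itself is what makes the new code robust across those layouts; a minimal sketch of the idea:

	/* clears 4 or 5 ints - however many the arch's struct compat_statfs declares */
	if (__clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
		return -EFAULT;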
I don't expect anyone actually uses the old statfs system call anymore but if they do let them benefit from having the compat and the native version working the same. Signed-off-by: Eric W. Biederman Signed-off-by: Christoph Hellwig --- arch/mips/include/asm/compat.h | 3 ++- arch/parisc/include/asm/compat.h | 3 ++- arch/powerpc/include/asm/compat.h | 3 ++- arch/s390/include/asm/compat.h | 3 ++- arch/sparc/include/asm/compat.h | 3 ++- arch/x86/include/asm/compat.h | 3 ++- fs/compat.c | 7 ++----- 7 files changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index dbc51065df5b..b77df0366ee6 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -111,7 +111,8 @@ struct compat_statfs { int f_bavail; compat_fsid_t f_fsid; int f_namelen; - int f_spare[6]; + int f_flags; + int f_spare[5]; }; #define COMPAT_RLIM_INFINITY 0x7fffffffUL diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index efa0b60c63fe..760f331d4fa3 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -105,7 +105,8 @@ struct compat_statfs { __kernel_fsid_t f_fsid; s32 f_namelen; s32 f_frsize; - s32 f_spare[5]; + s32 f_flags; + s32 f_spare[4]; }; struct compat_sigcontext { diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 91010e8f8479..88e602f6430d 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -100,7 +100,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index da359ca6fe55..cdb9b78f6c08 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -131,7 +131,8 @@ struct compat_statfs { compat_fsid_t f_fsid; s32 f_namelen; s32 f_frsize; - s32 f_spare[6]; + s32 f_flags; + s32 f_spare[5]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 6f57325bb883..b8be20d42a0a 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -134,7 +134,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_INFINITY 0x7fffffff diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 1d9cd27c2920..30d737ef2a42 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -108,7 +108,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. 
*/ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/fs/compat.c b/fs/compat.c index 58b1da459893..0f1ab468fa2d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -247,11 +247,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(0, &ubuf->f_spare[0]) || - __put_user(0, &ubuf->f_spare[1]) || - __put_user(0, &ubuf->f_spare[2]) || - __put_user(0, &ubuf->f_spare[3]) || - __put_user(0, &ubuf->f_spare[4])) + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } -- cgit v1.2.3-58-ga151 From 8fd90c8d1dacb5ff0f372217c97f57a9e61559cd Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 23 Oct 2011 23:13:30 +0530 Subject: vfs: indicate that the permission functions take all the MAY_* flags Acked-by: J. Bruce Fields Acked-by: David Howells Signed-off-by: Andreas Gruenbacher Signed-off-by: Aneesh Kumar K.V Signed-off-by: Christoph Hellwig --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0b3138de2a3b..2a4574f48001 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -257,7 +257,7 @@ other_perms: /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -331,7 +331,7 @@ static inline int do_inode_permission(struct inode *inode, int mask) /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on an inode. * We use "fsuid" for this, letting us set arbitrary permissions -- cgit v1.2.3-58-ga151 From d124b60a838141bb9cac1b6567e9ca4539d1fff0 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 23 Oct 2011 23:13:32 +0530 Subject: vfs: pass all mask flags check_acl and posix_acl_permission Acked-by: J. 
Bruce Fields Acked-by: David Howells Signed-off-by: Andreas Gruenbacher Signed-off-by: Aneesh Kumar K.V Signed-off-by: Christoph Hellwig --- fs/namei.c | 2 -- fs/posix_acl.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 2a4574f48001..276cd30ab8f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,8 +227,6 @@ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; - if (current_user_ns() != inode_userns(inode)) goto other_perms; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 10027b42b7e2..cea4623f1ed6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; + want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: -- cgit v1.2.3-58-ga151 From 948409c74d217f6cf054b8c927765a1c3fe16b53 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 23 Oct 2011 23:13:33 +0530 Subject: vfs: add a comment to inode_permission() Acked-by: J. Bruce Fields Acked-by: David Howells Signed-off-by: Andreas Gruenbacher Signed-off-by: Aneesh Kumar K.V Signed-off-by: Christoph Hellwig --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 276cd30ab8f8..9061157e39d6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -221,7 +221,7 @@ static int check_acl(struct inode *inode, int mask) } /* - * This does basic POSIX ACL permission checking + * This does the basic permission checking */ static int acl_permission_check(struct inode *inode, int mask) { @@ -271,7 +271,7 @@ int generic_permission(struct inode *inode, int mask) int ret; /* - * Do the basic POSIX ACL permission checks. + * Do the basic permission checks. */ ret = acl_permission_check(inode, mask); if (ret != -EACCES) @@ -335,6 +335,8 @@ static inline int do_inode_permission(struct inode *inode, int mask) * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct inode *inode, int mask) { -- cgit v1.2.3-58-ga151 From 62a3ddef6181d7d932c565d97552d2f7b9ab4d28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2011 10:03:41 +0200 Subject: vfs: fix spinning prevention in prune_icache_sb We need to move the inode to the end of the list to actually make the spinning prevention explained in the comment above it work. With a plain list_move it will simply stay in place as we're always reclaiming from the head of the list. Signed-off-by: Christoph Hellwig --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index ec7924696a13..ecbb68dc7e2a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. 
*/ if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); + list_move_tail(&inode->i_lru, &sb->s_inode_lru); continue; } -- cgit v1.2.3-58-ga151 From eb28be2b4c0a0608e54f0a8fc237372c674eb7d0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:03 -0700 Subject: direct-io: separate fields only used in the submission path from struct dio This large, but largely mechanical, patch moves all fields in struct dio that are only used in the submission path into a separate on stack data structure. This has the advantage that the memory is very likely cache hot, which is not guaranteed for memory fresh out of kmalloc. This also gives gcc more optimization potential because it can more easily determine that there are no external aliases for these variables. The sdio initialization is now an initialization instead of a memset. This allows gcc to break sdio into individual fields and optimize away unnecessary zeroing (after all the functions are inlined). Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 389 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 201 insertions(+), 188 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 44a360ca8046..02ccf766903c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -55,13 +55,10 @@ * blocksize. */ -struct dio { - /* BIO submission state */ +/* dio_state only used in the submission path */ + +struct dio_submit { struct bio *bio; /* bio under assembly */ - struct inode *inode; - int rw; - loff_t i_size; /* i_size when submitted */ - int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -81,13 +78,12 @@ struct dio { int boundary; /* prev block is at a boundary */ int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ - dio_iodone_t *end_io; /* IO completion function */ dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ - struct buffer_head map_bh; /* last get_block() result */ /* * Deferred addition of a page to the dio. These variables are @@ -100,18 +96,6 @@ struct dio { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* BIO completion state */ - spinlock_t bio_lock; /* protects BIO fields below */ - unsigned long refcount; /* direct_io_worker() and bios */ - struct bio *bio_list; /* singly linked via bi_private */ - struct task_struct *waiter; /* waiting task (NULL if none) */ - - /* AIO related stuff */ - struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ - ssize_t result; /* IO result */ - /* * Page fetching state. These variables belong to dio_refill_pages(). 
*/ @@ -125,6 +109,30 @@ struct dio { */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ +}; + +/* dio_state communicated between submission path and end_io */ +struct dio { + int flags; /* doesn't change */ + struct inode *inode; + int rw; + loff_t i_size; /* i_size when submitted */ + dio_iodone_t *end_io; /* IO completion function */ + struct buffer_head map_bh; /* last get_block() result */ + + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + ssize_t result; /* IO result */ + int page_errors; /* errno from get_user_pages() */ /* @@ -182,27 +190,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done); /* * How many pages are in the queue? */ -static inline unsigned dio_pages_present(struct dio *dio) +static inline unsigned dio_pages_present(struct dio_submit *sdio) { - return dio->tail - dio->head; + return sdio->tail - sdio->head; } /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio) +static int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; - nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); ret = get_user_pages_fast( - dio->curr_user_address, /* Where from? */ + sdio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ &dio->pages[0]); /* Put results here */ - if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -213,17 +221,17 @@ static int dio_refill_pages(struct dio *dio) dio->page_errors = ret; page_cache_get(page); dio->pages[0] = page; - dio->head = 0; - dio->tail = 1; + sdio->head = 0; + sdio->tail = 1; ret = 0; goto out; } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; - dio->curr_page += ret; - dio->head = 0; - dio->tail = ret; + sdio->curr_user_address += ret * PAGE_SIZE; + sdio->curr_page += ret; + sdio->head = 0; + sdio->tail = ret; ret = 0; } out: @@ -236,17 +244,17 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. 
*/ -static struct page *dio_get_page(struct dio *dio) +static struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio) { - if (dio_pages_present(dio) == 0) { + if (dio_pages_present(sdio) == 0) { int ret; - ret = dio_refill_pages(dio); + ret = dio_refill_pages(dio, sdio); if (ret) return ERR_PTR(ret); - BUG_ON(dio_pages_present(dio) == 0); + BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[dio->head++]; + return dio->pages[sdio->head++]; } /** @@ -368,8 +376,9 @@ void dio_end_io(struct bio *bio, int error) EXPORT_SYMBOL_GPL(dio_end_io); static void -dio_bio_alloc(struct dio *dio, struct block_device *bdev, - sector_t first_sector, int nr_vecs) +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, + struct block_device *bdev, + sector_t first_sector, int nr_vecs) { struct bio *bio; @@ -386,8 +395,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, else bio->bi_end_io = dio_bio_end_io; - dio->bio = bio; - dio->logical_offset_in_bio = dio->cur_page_fs_offset; + sdio->bio = bio; + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } /* @@ -397,9 +406,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio) +static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { - struct bio *bio = dio->bio; + struct bio *bio = sdio->bio; unsigned long flags; bio->bi_private = dio; @@ -411,24 +420,24 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - if (dio->submit_io) - dio->submit_io(dio->rw, bio, dio->inode, - dio->logical_offset_in_bio); + if (sdio->submit_io) + sdio->submit_io(dio->rw, bio, dio->inode, + sdio->logical_offset_in_bio); else submit_bio(dio->rw, bio); - dio->bio = NULL; - dio->boundary = 0; - dio->logical_offset_in_bio = 0; + sdio->bio = NULL; + sdio->boundary = 0; + sdio->logical_offset_in_bio = 0; } /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio) +static void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + while (dio_pages_present(sdio)) + page_cache_release(dio_get_page(dio, sdio)); } /* @@ -518,11 +527,11 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio) +static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->reap_counter++ >= 64) { + if (sdio->reap_counter++ >= 64) { while (dio->bio_list) { unsigned long flags; struct bio *bio; @@ -536,14 +545,14 @@ static int dio_bio_reap(struct dio *dio) if (ret == 0) ret = ret2; } - dio->reap_counter = 0; + sdio->reap_counter = 0; } return ret; } /* * Call into the fs to map some more disk blocks. We record the current number - * of available blocks at dio->blocks_available. These are in units of the + * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). * * The fs is allowed to map lots of blocks at once. If it wants to do that, @@ -564,7 +573,7 @@ static int dio_bio_reap(struct dio *dio) * buffer_mapped(). However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. 
*/ -static int get_more_blocks(struct dio *dio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) { int ret; struct buffer_head *map_bh = &dio->map_bh; @@ -580,11 +589,11 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - BUG_ON(dio->block_in_file >= dio->final_block_in_request); - fs_startblk = dio->block_in_file >> dio->blkfactor; - dio_count = dio->final_block_in_request - dio->block_in_file; - fs_count = dio_count >> dio->blkfactor; - blkmask = (1 << dio->blkfactor) - 1; + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); + fs_startblk = sdio->block_in_file >> sdio->blkfactor; + dio_count = sdio->final_block_in_request - sdio->block_in_file; + fs_count = dio_count >> sdio->blkfactor; + blkmask = (1 << sdio->blkfactor) - 1; if (dio_count & blkmask) fs_count++; @@ -604,12 +613,12 @@ static int get_more_blocks(struct dio *dio) */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (dio->block_in_file < (i_size_read(dio->inode) >> - dio->blkbits)) + if (sdio->block_in_file < (i_size_read(dio->inode) >> + sdio->blkbits)) create = 0; } - ret = (*dio->get_block)(dio->inode, fs_startblk, + ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); } return ret; @@ -618,20 +627,21 @@ static int get_more_blocks(struct dio *dio) /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, sector_t start_sector) +static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector) { sector_t sector; int ret, nr_pages; - ret = dio_bio_reap(dio); + ret = dio_bio_reap(dio, sdio); if (ret) goto out; - sector = start_sector << (dio->blkbits - 9); - nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + sector = start_sector << (sdio->blkbits - 9); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); - dio->boundary = 0; + dio_bio_alloc(dio, sdio, dio->map_bh.b_bdev, sector, nr_pages); + sdio->boundary = 0; out: return ret; } @@ -643,21 +653,21 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio *dio) +static int dio_bio_add_page(struct dio_submit *sdio) { int ret; - ret = bio_add_page(dio->bio, dio->cur_page, - dio->cur_page_len, dio->cur_page_offset); - if (ret == dio->cur_page_len) { + ret = bio_add_page(sdio->bio, sdio->cur_page, + sdio->cur_page_len, sdio->cur_page_offset); + if (ret == sdio->cur_page_len) { /* * Decrement count only, if we are done with this page */ - if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) - dio->pages_in_io--; - page_cache_get(dio->cur_page); - dio->final_block_in_bio = dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits); + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) + sdio->pages_in_io--; + page_cache_get(sdio->cur_page); + sdio->final_block_in_bio = sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits); ret = 0; } else { ret = 1; @@ -675,14 +685,14 @@ static int dio_bio_add_page(struct dio *dio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. 
*/ -static int dio_send_cur_page(struct dio *dio) +static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->bio) { - loff_t cur_offset = dio->cur_page_fs_offset; - loff_t bio_next_offset = dio->logical_offset_in_bio + - dio->bio->bi_size; + if (sdio->bio) { + loff_t cur_offset = sdio->cur_page_fs_offset; + loff_t bio_next_offset = sdio->logical_offset_in_bio + + sdio->bio->bi_size; /* * See whether this new request is contiguous with the old. @@ -698,28 +708,28 @@ static int dio_send_cur_page(struct dio *dio) * be the next logical offset in the bio, submit the bio we * have. */ - if (dio->final_block_in_bio != dio->cur_page_block || + if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) - dio_bio_submit(dio); + dio_bio_submit(dio, sdio); /* * Submit now if the underlying fs is about to perform a * metadata read */ - else if (dio->boundary) - dio_bio_submit(dio); + else if (sdio->boundary) + dio_bio_submit(dio, sdio); } - if (dio->bio == NULL) { - ret = dio_new_bio(dio, dio->cur_page_block); + if (sdio->bio == NULL) { + ret = dio_new_bio(dio, sdio, sdio->cur_page_block); if (ret) goto out; } - if (dio_bio_add_page(dio) != 0) { - dio_bio_submit(dio); - ret = dio_new_bio(dio, dio->cur_page_block); + if (dio_bio_add_page(sdio) != 0) { + dio_bio_submit(dio, sdio); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block); if (ret == 0) { - ret = dio_bio_add_page(dio); + ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); } } @@ -745,7 +755,7 @@ out: * page to the dio instead. */ static int -submit_page_section(struct dio *dio, struct page *page, +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr) { int ret = 0; @@ -760,20 +770,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * Can we just grow the current page's presence in the dio? */ - if ( (dio->cur_page == page) && - (dio->cur_page_offset + dio->cur_page_len == offset) && - (dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits) == blocknr)) { - dio->cur_page_len += len; + if (sdio->cur_page == page && + sdio->cur_page_offset + sdio->cur_page_len == offset && + sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { + sdio->cur_page_len += len; /* - * If dio->boundary then we want to schedule the IO now to + * If sdio->boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (dio->boundary) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } goto out; } @@ -781,20 +791,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * If there's a deferred page already there then send it. 
*/ - if (dio->cur_page) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->cur_page) { + ret = dio_send_cur_page(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; if (ret) goto out; } page_cache_get(page); /* It is in dio */ - dio->cur_page = page; - dio->cur_page_offset = offset; - dio->cur_page_len = len; - dio->cur_page_block = blocknr; - dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; + sdio->cur_page = page; + sdio->cur_page_offset = offset; + sdio->cur_page_len = len; + sdio->cur_page_block = blocknr; + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: return ret; } @@ -826,19 +836,19 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, int end) +static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ unsigned this_chunk_bytes; struct page *page; - dio->start_zero_done = 1; - if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + sdio->start_zero_done = 1; + if (!sdio->blkfactor || !buffer_new(&dio->map_bh)) return; - dio_blocks_per_fs_block = 1 << dio->blkfactor; - this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + dio_blocks_per_fs_block = 1 << sdio->blkfactor; + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); if (!this_chunk_blocks) return; @@ -850,14 +860,14 @@ static void dio_zero_block(struct dio *dio, int end) if (end) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; - this_chunk_bytes = this_chunk_blocks << dio->blkbits; + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; page = ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes, - dio->next_block_for_io)) + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, + sdio->next_block_for_io)) return; - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; } /* @@ -876,9 +886,9 @@ static void dio_zero_block(struct dio *dio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. 
*/ -static int do_direct_IO(struct dio *dio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) { - const unsigned blkbits = dio->blkbits; + const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; @@ -886,10 +896,10 @@ static int do_direct_IO(struct dio *dio) int ret = 0; /* The I/O can start at any block offset within the first page */ - block_in_page = dio->first_block_in_page; + block_in_page = sdio->first_block_in_page; - while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + while (sdio->block_in_file < sdio->final_block_in_request) { + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; @@ -901,14 +911,14 @@ static int do_direct_IO(struct dio *dio) unsigned this_chunk_blocks; /* # of blocks */ unsigned u; - if (dio->blocks_available == 0) { + if (sdio->blocks_available == 0) { /* * Need to go and map some more disk */ unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio); + ret = get_more_blocks(dio, sdio); if (ret) { page_cache_release(page); goto out; @@ -916,18 +926,18 @@ static int do_direct_IO(struct dio *dio) if (!buffer_mapped(map_bh)) goto do_holes; - dio->blocks_available = - map_bh->b_size >> dio->blkbits; - dio->next_block_for_io = - map_bh->b_blocknr << dio->blkfactor; + sdio->blocks_available = + map_bh->b_size >> sdio->blkbits; + sdio->next_block_for_io = + map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) clean_blockdev_aliases(dio); - if (!dio->blkfactor) + if (!sdio->blkfactor) goto do_holes; - blkmask = (1 << dio->blkfactor) - 1; - dio_remainder = (dio->block_in_file & blkmask); + blkmask = (1 << sdio->blkfactor) - 1; + dio_remainder = (sdio->block_in_file & blkmask); /* * If we are at the start of IO and that IO @@ -941,8 +951,8 @@ static int do_direct_IO(struct dio *dio) * on-disk */ if (!buffer_new(map_bh)) - dio->next_block_for_io += dio_remainder; - dio->blocks_available -= dio_remainder; + sdio->next_block_for_io += dio_remainder; + sdio->blocks_available -= dio_remainder; } do_holes: /* Handle holes */ @@ -961,7 +971,7 @@ do_holes: */ i_size_aligned = ALIGN(i_size_read(dio->inode), 1 << blkbits); - if (dio->block_in_file >= + if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); @@ -969,7 +979,7 @@ do_holes: } zero_user(page, block_in_page << blkbits, 1 << blkbits); - dio->block_in_file++; + sdio->block_in_file++; block_in_page++; goto next_block; } @@ -979,38 +989,40 @@ do_holes: * is finer than the underlying fs, go check to see if * we must zero out the start of this block. 
*/ - if (unlikely(dio->blkfactor && !dio->start_zero_done)) - dio_zero_block(dio, 0); + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) + dio_zero_block(dio, sdio, 0); /* * Work out, in this_chunk_blocks, how much disk we * can add to this page */ - this_chunk_blocks = dio->blocks_available; + this_chunk_blocks = sdio->blocks_available; u = (PAGE_SIZE - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; - u = dio->final_block_in_request - dio->block_in_file; + u = sdio->final_block_in_request - sdio->block_in_file; if (this_chunk_blocks > u) this_chunk_blocks = u; this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, - this_chunk_bytes, dio->next_block_for_io); + sdio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, sdio, page, + offset_in_page, + this_chunk_bytes, + sdio->next_block_for_io); if (ret) { page_cache_release(page); goto out; } - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; - dio->block_in_file += this_chunk_blocks; + sdio->block_in_file += this_chunk_blocks; block_in_page += this_chunk_blocks; - dio->blocks_available -= this_chunk_blocks; + sdio->blocks_available -= this_chunk_blocks; next_block: - BUG_ON(dio->block_in_file > dio->final_block_in_request); - if (dio->block_in_file == dio->final_block_in_request) + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); + if (sdio->block_in_file == sdio->final_block_in_request) break; } @@ -1026,7 +1038,7 @@ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio) + dio_submit_t submit_io, struct dio *dio, struct dio_submit *sdio) { unsigned long user_addr; unsigned long flags; @@ -1037,15 +1049,15 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->inode = inode; dio->rw = rw; - dio->blkbits = blkbits; - dio->blkfactor = inode->i_blkbits - blkbits; - dio->block_in_file = offset >> blkbits; + sdio->blkbits = blkbits; + sdio->blkfactor = inode->i_blkbits - blkbits; + sdio->block_in_file = offset >> blkbits; - dio->get_block = get_block; + sdio->get_block = get_block; dio->end_io = end_io; - dio->submit_io = submit_io; - dio->final_block_in_bio = -1; - dio->next_block_for_io = -1; + sdio->submit_io = submit_io; + sdio->final_block_in_bio = -1; + sdio->next_block_for_io = -1; dio->iocb = iocb; dio->i_size = i_size_read(inode); @@ -1057,45 +1069,45 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * In case of non-aligned buffers, we may need 2 more * pages since we need to zero out first and last block. 
*/ - if (unlikely(dio->blkfactor)) - dio->pages_in_io = 2; + if (unlikely(sdio->blkfactor)) + sdio->pages_in_io = 2; for (seg = 0; seg < nr_segs; seg++) { user_addr = (unsigned long)iov[seg].iov_base; - dio->pages_in_io += + sdio->pages_in_io += ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - user_addr/PAGE_SIZE); } for (seg = 0; seg < nr_segs; seg++) { user_addr = (unsigned long)iov[seg].iov_base; - dio->size += bytes = iov[seg].iov_len; + sdio->size += bytes = iov[seg].iov_len; /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - dio->final_block_in_request = dio->block_in_file + + sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio->final_block_in_request = sdio->block_in_file + (bytes >> blkbits); /* Page fetching state */ - dio->head = 0; - dio->tail = 0; - dio->curr_page = 0; + sdio->head = 0; + sdio->tail = 0; + sdio->curr_page = 0; - dio->total_pages = 0; + sdio->total_pages = 0; if (user_addr & (PAGE_SIZE-1)) { - dio->total_pages++; + sdio->total_pages++; bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - dio->curr_user_address = user_addr; + sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->curr_user_address = user_addr; - ret = do_direct_IO(dio); + ret = do_direct_IO(dio, sdio); dio->result += iov[seg].iov_len - - ((dio->final_block_in_request - dio->block_in_file) << + ((sdio->final_block_in_request - sdio->block_in_file) << blkbits); if (ret) { - dio_cleanup(dio); + dio_cleanup(dio, sdio); break; } } /* end iovec loop */ @@ -1111,23 +1123,23 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * There may be some unwritten disk at the end of a part-written * fs-block-sized block. Go zero that now. */ - dio_zero_block(dio, 1); + dio_zero_block(dio, sdio, 1); - if (dio->cur_page) { - ret2 = dio_send_cur_page(dio); + if (sdio->cur_page) { + ret2 = dio_send_cur_page(dio, sdio); if (ret == 0) ret = ret2; - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } - if (dio->bio) - dio_bio_submit(dio); + if (sdio->bio) + dio_bio_submit(dio, sdio); /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. */ - dio_cleanup(dio); + dio_cleanup(dio, sdio); /* * All block lookups have been performed. For READ requests @@ -1146,7 +1158,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(ret == -EIOCBQUEUED); if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == dio->size))) + ((rw & READ) || (dio->result == sdio->size))) ret = -EIOCBQUEUED; if (ret != -EIOCBQUEUED) @@ -1211,6 +1223,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; + struct dio_submit sdio = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1290,7 +1303,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, - submit_io, dio); + submit_io, dio, &sdio); out: return retval; -- cgit v1.2.3-58-ga151 From cde1ecb3247f67c167918ea6326159209996fd54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:04 -0700 Subject: direct-io: fix a wrong comment There's nothing on the stack, even before my changes. 
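For scale, assuming 64-bit pointers (the patch itself does not state a word size):

    #define DIO_PAGES	64	/* pages[] = 64 * sizeof(struct page *) = 512 bytes */

so the page buffer alone is half a kilobyte of every struct dio, which is why the following patches in this series care about where the structure is allocated.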
Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 02ccf766903c..c1e03ae961af 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,7 +39,7 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure in the slab cache */ #define DIO_PAGES 64 -- cgit v1.2.3-58-ga151 From 0dc2bc49be545626a2dc6da133202ffe69ac3fcc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:05 -0700 Subject: direct-io: rearrange fields in dio/dio_submit to avoid holes Fix most problems reported by pahole. There is still a weird 104 byte hole after map_bh. I'm not sure what causes this. Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index c1e03ae961af..b01ea0d7160d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -73,10 +73,10 @@ struct dio_submit { sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ + int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ - int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ dio_submit_t *submit_io; /* IO submition function */ @@ -114,27 +114,26 @@ struct dio_submit { /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - struct inode *inode; int rw; + struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ - struct buffer_head map_bh; /* last get_block() result */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ + int page_errors; /* errno from get_user_pages() */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ struct task_struct *waiter; /* waiting task (NULL if none) */ /* AIO related stuff */ struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ ssize_t result; /* IO result */ - int page_errors; /* errno from get_user_pages() */ - + struct buffer_head map_bh; /* last get_block() result */ /* * pages[] (and any fields placed after it) are not zeroed out at * allocation time. Don't add new fields after pages[] unless you -- cgit v1.2.3-58-ga151 From 6e8267f532a17165ab551ac5fdafcba5333dcca5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:06 -0700 Subject: direct-io: use a slab cache for struct dio A direct slab call is slightly faster than kmalloc and can be better cached per CPU. It also avoids rounding to the next kmalloc slab. In addition this enforces cache line alignment for struct dio to avoid any false sharing. 
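A minimal, self-contained sketch of the pattern this commit adopts, a dedicated cacheline-aligned slab cache, using illustrative names ("foo"); only KMEM_CACHE(), kmem_cache_alloc()/kmem_cache_free(), ____cacheline_aligned_in_smp and SLAB_PANIC are taken from the patch:

    #include <linux/module.h>
    #include <linux/slab.h>

    struct foo {
            long hot_state;
    } ____cacheline_aligned_in_smp;	/* keep CPUs off each other's cache lines */

    static struct kmem_cache *foo_cache __read_mostly;

    static int __init foo_init(void)
    {
            /* name, object size and alignment are derived from the type;
             * SLAB_PANIC makes cache-creation failure fatal at boot */
            foo_cache = KMEM_CACHE(foo, SLAB_PANIC);
            return 0;
    }
    module_init(foo_init);

    static struct foo *foo_alloc(void)
    {
            /* exactly sized and cached per CPU, unlike a rounded-up kmalloc() */
            return kmem_cache_alloc(foo_cache, GFP_KERNEL);
    }

    static void foo_free(struct foo *f)
    {
            kmem_cache_free(foo_cache, f);
    }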
Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index b01ea0d7160d..6bb04409e725 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -140,7 +140,9 @@ struct dio { * wish that they not be zeroed. */ struct page *pages[DIO_PAGES]; /* page buffer */ -}; +} ____cacheline_aligned_in_smp; + +static struct kmem_cache *dio_cache __read_mostly; static void __inode_dio_wait(struct inode *inode) { @@ -330,7 +332,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) if (remaining == 0) { dio_complete(dio, dio->iocb->ki_pos, 0, true); - kfree(dio); + kmem_cache_free(dio_cache, dio); } } @@ -1180,7 +1182,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (ret2 == 0) { ret = dio_complete(dio, offset, ret, false); - kfree(dio); + kmem_cache_free(dio_cache, dio); } else BUG_ON(ret != -EIOCBQUEUED); @@ -1256,7 +1258,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw == READ && end == offset) return 0; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; @@ -1280,7 +1282,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, end - 1); if (retval) { mutex_unlock(&inode->i_mutex); - kfree(dio); + kmem_cache_free(dio_cache, dio); goto out; } } @@ -1308,3 +1310,10 @@ out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); + +static __init int dio_init(void) +{ + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); + return 0; +} +module_init(dio_init) -- cgit v1.2.3-58-ga151 From 18772641dbe2c89c6122c603f81f6a9574aee556 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:07 -0700 Subject: direct-io: separate map_bh from dio Only a single b_private field in the map_bh buffer head is needed after the submission path. Move map_bh separately to avoid storing this information in the long term slab. This avoids the weird 104 byte hole in struct dio_submit which also needed to be memseted early. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 66 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 6bb04409e725..edf3174afd6a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -119,6 +119,7 @@ struct dio { loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ + void *private; /* copy from map_bh.b_private */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ @@ -133,7 +134,6 @@ struct dio { struct kiocb *iocb; /* kiocb */ ssize_t result; /* IO result */ - struct buffer_head map_bh; /* last get_block() result */ /* * pages[] (and any fields placed after it) are not zeroed out at * allocation time. Don't add new fields after pages[] unless you @@ -301,7 +301,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private, ret, is_async); + dio->private, ret, is_async); } else { if (is_async) aio_complete(dio->iocb, ret, 0); @@ -574,10 +574,10 @@ static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * buffer_mapped(). 
However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. */ -static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret; - struct buffer_head *map_bh = &dio->map_bh; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ @@ -621,6 +621,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); + + /* Store for completion */ + dio->private = map_bh->b_private; } return ret; } @@ -629,7 +632,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) * There is no bio. Make one now. */ static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, - sector_t start_sector) + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; @@ -638,10 +641,10 @@ static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, sdio, dio->map_bh.b_bdev, sector, nr_pages); + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; out: return ret; @@ -686,7 +689,8 @@ static int dio_bio_add_page(struct dio_submit *sdio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio) +static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; @@ -721,14 +725,14 @@ static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio) } if (sdio->bio == NULL) { - ret = dio_new_bio(dio, sdio, sdio->cur_page_block); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret) goto out; } if (dio_bio_add_page(sdio) != 0) { dio_bio_submit(dio, sdio); - ret = dio_new_bio(dio, sdio, sdio->cur_page_block); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); @@ -757,7 +761,8 @@ out: */ static int submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, - unsigned offset, unsigned len, sector_t blocknr) + unsigned offset, unsigned len, sector_t blocknr, + struct buffer_head *map_bh) { int ret = 0; @@ -782,7 +787,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, * avoid metadata seeks. */ if (sdio->boundary) { - ret = dio_send_cur_page(dio, sdio); + ret = dio_send_cur_page(dio, sdio, map_bh); page_cache_release(sdio->cur_page); sdio->cur_page = NULL; } @@ -793,7 +798,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, * If there's a deferred page already there then send it. */ if (sdio->cur_page) { - ret = dio_send_cur_page(dio, sdio); + ret = dio_send_cur_page(dio, sdio, map_bh); page_cache_release(sdio->cur_page); sdio->cur_page = NULL; if (ret) @@ -815,16 +820,16 @@ out: * file blocks. 
Only called for S_ISREG files - blockdevs do not set * buffer_new */ -static void clean_blockdev_aliases(struct dio *dio) +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) { unsigned i; unsigned nblocks; - nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + nblocks = map_bh->b_size >> dio->inode->i_blkbits; for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(dio->map_bh.b_bdev, - dio->map_bh.b_blocknr + i); + unmap_underlying_metadata(map_bh->b_bdev, + map_bh->b_blocknr + i); } } @@ -837,7 +842,8 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) +static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end, + struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ @@ -845,7 +851,7 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) struct page *page; sdio->start_zero_done = 1; - if (!sdio->blkfactor || !buffer_new(&dio->map_bh)) + if (!sdio->blkfactor || !buffer_new(map_bh)) return; dio_blocks_per_fs_block = 1 << sdio->blkfactor; @@ -865,7 +871,7 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) page = ZERO_PAGE(0); if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, - sdio->next_block_for_io)) + sdio->next_block_for_io, map_bh)) return; sdio->next_block_for_io += this_chunk_blocks; @@ -887,13 +893,13 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ -static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; - struct buffer_head *map_bh = &dio->map_bh; int ret = 0; /* The I/O can start at any block offset within the first page */ @@ -919,7 +925,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio, sdio); + ret = get_more_blocks(dio, sdio, map_bh); if (ret) { page_cache_release(page); goto out; @@ -932,7 +938,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) - clean_blockdev_aliases(dio); + clean_blockdev_aliases(dio, map_bh); if (!sdio->blkfactor) goto do_holes; @@ -991,7 +997,7 @@ do_holes: * we must zero out the start of this block. 
*/ if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) - dio_zero_block(dio, sdio, 0); + dio_zero_block(dio, sdio, 0, map_bh); /* * Work out, in this_chunk_blocks, how much disk we @@ -1011,7 +1017,8 @@ do_holes: ret = submit_page_section(dio, sdio, page, offset_in_page, this_chunk_bytes, - sdio->next_block_for_io); + sdio->next_block_for_io, + map_bh); if (ret) { page_cache_release(page); goto out; @@ -1047,6 +1054,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ssize_t ret = 0; ssize_t ret2; size_t bytes; + struct buffer_head map_bh = { 0, }; dio->inode = inode; dio->rw = rw; @@ -1101,7 +1109,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; sdio->curr_user_address = user_addr; - ret = do_direct_IO(dio, sdio); + ret = do_direct_IO(dio, sdio, &map_bh); dio->result += iov[seg].iov_len - ((sdio->final_block_in_request - sdio->block_in_file) << @@ -1124,10 +1132,10 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * There may be some unwritten disk at the end of a part-written * fs-block-sized block. Go zero that now. */ - dio_zero_block(dio, sdio, 1); + dio_zero_block(dio, sdio, 1, &map_bh); if (sdio->cur_page) { - ret2 = dio_send_cur_page(dio, sdio); + ret2 = dio_send_cur_page(dio, sdio, &map_bh); if (ret == 0) ret = ret2; page_cache_release(sdio->cur_page); -- cgit v1.2.3-58-ga151 From ba253fbf6d3502c54e1ac8792e7ac8290a1f5b8d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:08 -0700 Subject: direct-io: inline the complete submission path Add inlines to all the submission path functions. While this increases code size it also gives gcc a lot of optimization opportunities in this critical hotpath. In particular -- together with some other changes -- this allows gcc to get rid of the unnecessary clearing of sdio at the beginning and optimize the messy parameter passing. Any non inlining of a function which takes a sdio parameter would break this optimization because they cannot be done if the address of a structure is taken. Note that benefits are only seen with CONFIG_OPTIMIZE_INLINING and CONFIG_CC_OPTIMIZE_FOR_SIZE both set to off. This gives about 2.2% improvement on a large database benchmark with a high IOPS rate. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index edf3174afd6a..6d425821be66 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -199,7 +199,7 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; @@ -245,7 +245,8 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. 
*/ -static struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio) +static inline struct page *dio_get_page(struct dio *dio, + struct dio_submit *sdio) { if (dio_pages_present(sdio) == 0) { int ret; @@ -376,7 +377,7 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static void +static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, sector_t first_sector, int nr_vecs) @@ -407,7 +408,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { struct bio *bio = sdio->bio; unsigned long flags; @@ -435,7 +436,7 @@ static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio, struct dio_submit *sdio) +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { while (dio_pages_present(sdio)) page_cache_release(dio_get_page(dio, sdio)); @@ -528,7 +529,7 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; @@ -631,8 +632,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, - sector_t start_sector, struct buffer_head *map_bh) +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; @@ -657,7 +658,7 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio_submit *sdio) +static inline int dio_bio_add_page(struct dio_submit *sdio) { int ret; @@ -689,8 +690,8 @@ static int dio_bio_add_page(struct dio_submit *sdio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, - struct buffer_head *map_bh) +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; @@ -759,7 +760,7 @@ out: * If that doesn't work out then we put the old page into the bio and add this * page to the dio instead. */ -static int +static inline int submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr, struct buffer_head *map_bh) @@ -842,8 +843,8 @@ static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. 
*/ -static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end, - struct buffer_head *map_bh) +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, + int end, struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ @@ -1042,7 +1043,7 @@ out: return ret; } -static ssize_t +static inline ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, @@ -1216,6 +1217,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * expected that filesystem provide exclusion between new direct I/O * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, * but other filesystems need to take care of this on their own. + * + * NOTE: if you pass "sdio" to anything by pointer make sure that function + * is always inlined. Otherwise gcc is unable to split the structure into + * individual fields and will generate much worse code. This is important + * for the whole file. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, -- cgit v1.2.3-58-ga151 From 847cc6371ba820763773e993000410d6d8d23515 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:09 -0700 Subject: direct-io: merge direct_io_walker into __blockdev_direct_IO This doesn't change anything for the compiler, but hch thought it would make the code clearer. I moved the reference counting into its own little inline. Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 271 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 132 insertions(+), 139 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 6d425821be66..d740ab67ff6e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1043,136 +1043,10 @@ out: return ret; } -static inline ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio, struct dio_submit *sdio) +static inline int drop_refcount(struct dio *dio) { - unsigned long user_addr; + int ret2; unsigned long flags; - int seg; - ssize_t ret = 0; - ssize_t ret2; - size_t bytes; - struct buffer_head map_bh = { 0, }; - - dio->inode = inode; - dio->rw = rw; - sdio->blkbits = blkbits; - sdio->blkfactor = inode->i_blkbits - blkbits; - sdio->block_in_file = offset >> blkbits; - - sdio->get_block = get_block; - dio->end_io = end_io; - sdio->submit_io = submit_io; - sdio->final_block_in_bio = -1; - sdio->next_block_for_io = -1; - - dio->iocb = iocb; - dio->i_size = i_size_read(inode); - - spin_lock_init(&dio->bio_lock); - dio->refcount = 1; - - /* - * In case of non-aligned buffers, we may need 2 more - * pages since we need to zero out first and last block. 
- */ - if (unlikely(sdio->blkfactor)) - sdio->pages_in_io = 2; - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio->pages_in_io += - ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - - user_addr/PAGE_SIZE); - } - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio->size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - sdio->final_block_in_request = sdio->block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - sdio->head = 0; - sdio->tail = 0; - sdio->curr_page = 0; - - sdio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - sdio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - sdio->curr_user_address = user_addr; - - ret = do_direct_IO(dio, sdio, &map_bh); - - dio->result += iov[seg].iov_len - - ((sdio->final_block_in_request - sdio->block_in_file) << - blkbits); - - if (ret) { - dio_cleanup(dio, sdio); - break; - } - } /* end iovec loop */ - - if (ret == -ENOTBLK) { - /* - * The remaining part of the request will be - * be handled by buffered I/O when we return - */ - ret = 0; - } - /* - * There may be some unwritten disk at the end of a part-written - * fs-block-sized block. Go zero that now. - */ - dio_zero_block(dio, sdio, 1, &map_bh); - - if (sdio->cur_page) { - ret2 = dio_send_cur_page(dio, sdio, &map_bh); - if (ret == 0) - ret = ret2; - page_cache_release(sdio->cur_page); - sdio->cur_page = NULL; - } - if (sdio->bio) - dio_bio_submit(dio, sdio); - - /* - * It is possible that, we return short IO due to end of file. - * In that case, we need to release all the pages we got hold on. - */ - dio_cleanup(dio, sdio); - - /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if (rw == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); - - /* - * The only time we want to leave bios in flight is when a successful - * partial aio read or full aio write have been setup. In that case - * bio completion will call aio_complete. The only time it's safe to - * call aio_complete is when we return -EIOCBQUEUED, so we key on that. - * This had *better* be the only place that raises -EIOCBQUEUED. 
- */ - BUG_ON(ret == -EIOCBQUEUED); - if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == sdio->size))) - ret = -EIOCBQUEUED; - - if (ret != -EIOCBQUEUED) - dio_await_completion(dio); /* * Sync will always be dropping the final ref and completing the @@ -1188,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_irqsave(&dio->bio_lock, flags); ret2 = --dio->refcount; spin_unlock_irqrestore(&dio->bio_lock, flags); - - if (ret2 == 0) { - ret = dio_complete(dio, offset, ret, false); - kmem_cache_free(dio_cache, dio); - } else - BUG_ON(ret != -EIOCBQUEUED); - - return ret; + return ret2; } /* @@ -1239,6 +1106,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, loff_t end = offset; struct dio *dio; struct dio_submit sdio = { 0, }; + unsigned long user_addr; + size_t bytes; + struct buffer_head map_bh = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1316,9 +1186,132 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, - submit_io, dio, &sdio); + retval = 0; + + dio->inode = inode; + dio->rw = rw; + sdio.blkbits = blkbits; + sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.block_in_file = offset >> blkbits; + + sdio.get_block = get_block; + dio->end_io = end_io; + sdio.submit_io = submit_io; + sdio.final_block_in_bio = -1; + sdio.next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(sdio.blkfactor)) + sdio.pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.pages_in_io += + ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / + PAGE_SIZE - user_addr / PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio.final_block_in_request = sdio.block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + sdio.head = 0; + sdio.tail = 0; + sdio.curr_page = 0; + + sdio.total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + sdio.total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio.curr_user_address = user_addr; + + retval = do_direct_IO(dio, &sdio, &map_bh); + + dio->result += iov[seg].iov_len - + ((sdio.final_block_in_request - sdio.block_in_file) << + blkbits); + + if (retval) { + dio_cleanup(dio, &sdio); + break; + } + } /* end iovec loop */ + + if (retval == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + retval = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. 
+ */ + dio_zero_block(dio, &sdio, 1, &map_bh); + + if (sdio.cur_page) { + ssize_t ret2; + + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); + if (retval == 0) + retval = ret2; + page_cache_release(sdio.cur_page); + sdio.cur_page = NULL; + } + if (sdio.bio) + dio_bio_submit(dio, &sdio); + + /* + * It is possible that, we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio, &sdio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that its achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(retval == -EIOCBQUEUED); + if (dio->is_async && retval == 0 && dio->result && + ((rw & READ) || (dio->result == sdio.size))) + retval = -EIOCBQUEUED; + + if (retval != -EIOCBQUEUED) + dio_await_completion(dio); + + if (drop_refcount(dio) == 0) { + retval = dio_complete(dio, offset, retval, false); + kmem_cache_free(dio_cache, dio); + } else + BUG_ON(retval != -EIOCBQUEUED); out: return retval; -- cgit v1.2.3-58-ga151 From ef3d0fd27e90f67e35da516dafc1482c82939a60 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:48 -0700 Subject: vfs: do (nearly) lockless generic_file_llseek The i_mutex lock use of generic _file_llseek hurts. Independent processes accessing the same file synchronize over a single lock, even though they have no need for synchronization at all. Under high utilization this can cause llseek to scale very poorly on larger systems. This patch does some rethinking of the llseek locking model: First the 64bit f_pos is not necessarily atomic without locks on 32bit systems. This can already cause races with read() today. This was discussed on linux-kernel in the past and deemed acceptable. The patch does not change that. Let's look at the different seek variants: SEEK_SET: Doesn't really need any locking. If there's a race one writer wins, the other loses. For 32bit the non atomic update races against read() stay the same. Without a lock they can also happen against write() now. The read() race was deemed acceptable in past discussions, and I think if it's ok for read it's ok for write too. => Don't need a lock. SEEK_END: This behaves like SEEK_SET plus it reads the maximum size too. Reading the maximum size would have the 32bit atomic problem. But luckily we already have a way to read the maximum size without locking (i_size_read), so we can just use that instead. Without i_mutex there is no synchronization with write() anymore, however since the write() update is atomic on 64bit it just behaves like another racy SEEK_SET. On non atomic 32bit it's the same as SEEK_SET. => Don't need a lock, but need to use i_size_read() SEEK_CUR: This has a read-modify-write race window on the same file. One could argue that any application doing unsynchronized seeks on the same file is already broken. But for the sake of not adding a regression here I'm using the file->f_lock to synchronize this. 
Using this lock is much better than the inode mutex because it doesn't synchronize between processes. => So still need a lock, but can use a f_lock. This patch implements this new scheme in generic_file_llseek. I dropped generic_file_llseek_unlocked and changed all callers. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/btrfs/file.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/gfs2/file.c | 4 +-- fs/nfs/file.c | 5 ++-- fs/read_write.c | 85 ++++++++++++++++++++++++++---------------------------- include/linux/fs.h | 9 ++++-- 6 files changed, 54 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4e57d59edb7..1266f6e9cdb2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1821,7 +1821,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) switch (origin) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek_unlocked(file, offset, origin); + offset = generic_file_llseek(file, offset, origin); goto out; case SEEK_DATA: case SEEK_HOLE: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 54b8f1e7da94..db7ce87d37a5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -723,7 +723,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (rc < 0) return (loff_t)rc; } - return generic_file_llseek_unlocked(file, offset, origin); + return generic_file_llseek(file, offset, origin); } static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e802903..fe6bc0207818 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -63,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = generic_file_llseek_unlocked(file, offset, origin); + error = generic_file_llseek(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = generic_file_llseek_unlocked(file, offset, origin); + error = generic_file_llseek(file, offset, origin); return error; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f3cda3..12623abcf3d4 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -198,11 +198,12 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; + /* AK: should drop this lock. Unlikely to be needed. 
*/ spin_lock(&inode->i_lock); - loff = generic_file_llseek_unlocked(filp, offset, origin); + loff = generic_file_llseek(filp, offset, origin); spin_unlock(&inode->i_lock); } else - loff = generic_file_llseek_unlocked(filp, offset, origin); + loff = generic_file_llseek(filp, offset, origin); return loff; } diff --git a/fs/read_write.c b/fs/read_write.c index 179f1c33ea57..672b187def62 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } +static loff_t lseek_execute(struct file *file, struct inode *inode, + loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} + /** - * generic_file_llseek_unlocked - lockless generic llseek implementation + * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek * - * Updates the file offset to the value specified by @offset and @origin. - * Locking must be provided by the caller. + * This is a generic implemenation of ->llseek usable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. + * + * Synchronization: + * SEEK_SET is unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks. + * SEEK_END */ loff_t -generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +generic_file_llseek(struct file *file, loff_t offset, int origin) { struct inode *inode = file->f_mapping->host; switch (origin) { case SEEK_END: - offset += inode->i_size; + offset += i_size_read(inode); break; case SEEK_CUR: /* @@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) */ if (offset == 0) return file->f_pos; - offset += file->f_pos; - break; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + spin_lock(&file->f_lock); + offset = lseek_execute(file, inode, file->f_pos + offset, + inode->i_sb->s_maxbytes); + spin_unlock(&file->f_lock); + return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; break; case SEEK_HOLE: @@ -77,46 +107,13 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; - offset = inode->i_size; + offset = i_size_read(inode); break; } - if (offset < 0 && !unsigned_offsets(file)) - return -EINVAL; - if (offset > inode->i_sb->s_maxbytes) - return -EINVAL; - - /* Special lock needed here? 
*/ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - - return offset; -} -EXPORT_SYMBOL(generic_file_llseek_unlocked); - -/** - * generic_file_llseek - generic llseek implementation for regular files - * @file: file structure to seek on - * @offset: file offset to seek to - * @origin: type of seek - * - * This is a generic implemenation of ->llseek useable for all normal local - * filesystems. It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. - */ -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) -{ - loff_t rval; - - mutex_lock(&file->f_dentry->d_inode->i_mutex); - rval = generic_file_llseek_unlocked(file, offset, origin); - mutex_unlock(&file->f_dentry->d_inode->i_mutex); - - return rval; + return lseek_execute(file, inode, offset, inode->i_sb->s_maxbytes); } EXPORT_SYMBOL(generic_file_llseek); diff --git a/include/linux/fs.h b/include/linux/fs.h index c1884e974ff4..db85196f6308 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -964,7 +964,12 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ + + /* + * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. + * Must not be taken from IRQ context. + */ + spinlock_t f_lock; #ifdef CONFIG_SMP int f_sb_list_cpu; #endif @@ -2398,8 +2403,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); -extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, - int origin); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-58-ga151 From 5760495a872d63a182962680a13c2af29235237c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:50 -0700 Subject: vfs: add generic_file_llseek_size Add a generic_file_llseek variant to the VFS that allows passing in the maximum file size of the file system, instead of always using maxbytes from the superblock. This can be used to eliminate some cut'n'paste seek code in ext4. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/read_write.c | 37 ++++++++++++++++++++++++++++--------- include/linux/fs.h | 2 ++ 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 672b187def62..dfd125798791 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -51,23 +51,23 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, } /** - * generic_file_llseek - generic llseek implementation for regular files + * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek + * @size: max size of file system * - * This is a generic implemenation of ->llseek usable for all normal local - * filesystems. It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. + * This is a variant of generic_file_llseek that allows passing in a custom + * file size. 
* * Synchronization: - * SEEK_SET is unsynchronized (but atomic on 64bit platforms) + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. - * SEEK_END */ loff_t -generic_file_llseek(struct file *file, loff_t offset, int origin) +generic_file_llseek_size(struct file *file, loff_t offset, int origin, + loff_t maxsize) { struct inode *inode = file->f_mapping->host; @@ -91,7 +91,7 @@ generic_file_llseek(struct file *file, loff_t offset, int origin) */ spin_lock(&file->f_lock); offset = lseek_execute(file, inode, file->f_pos + offset, - inode->i_sb->s_maxbytes); + maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: @@ -113,7 +113,26 @@ generic_file_llseek(struct file *file, loff_t offset, int origin) break; } - return lseek_execute(file, inode, offset, inode->i_sb->s_maxbytes); + return lseek_execute(file, inode, offset, maxsize); +} +EXPORT_SYMBOL(generic_file_llseek_size); + +/** + * generic_file_llseek - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is a generic implemenation of ->llseek useable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. + */ +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + + return generic_file_llseek_size(file, offset, origin, + inode->i_sb->s_maxbytes); } EXPORT_SYMBOL(generic_file_llseek); diff --git a/include/linux/fs.h b/include/linux/fs.h index db85196f6308..d055cc7d7240 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2403,6 +2403,8 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, + int origin, loff_t maxsize); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-58-ga151 From 4cce0e28b932c11454f75d1c1fae674600c23fbf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:51 -0700 Subject: ext4: replace cut'n'pasted llseek code with generic_file_llseek_size This gives ext4 the benefits of unlocked llseek. 
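A sketch of the ->llseek call pattern this helper enables for a filesystem with its own size limit; "myfs" and myfs_maxbytes() are hypothetical, while ext4's real version in the diff below picks between s_bitmap_maxbytes and s_maxbytes:

    #include <linux/fs.h>

    loff_t myfs_maxbytes(struct super_block *sb);	/* hypothetical limit helper */

    static loff_t myfs_llseek(struct file *file, loff_t offset, int origin)
    {
            struct inode *inode = file->f_mapping->host;

            /* no i_mutex: SEEK_SET/SEEK_END are lockless, SEEK_CUR takes f_lock */
            return generic_file_llseek_size(file, offset, origin,
                                            myfs_maxbytes(inode->i_sb));
    }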
Cc: tytso@mit.edu Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/ext4/file.c | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e4095e988eba..b9548f477bb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; else maxbytes = inode->i_sb->s_maxbytes; - mutex_lock(&inode->i_mutex); - switch (origin) { - case SEEK_END: - offset += inode->i_size; - break; - case SEEK_CUR: - if (offset == 0) { - mutex_unlock(&inode->i_mutex); - return file->f_pos; - } - offset += file->f_pos; - break; - case SEEK_DATA: - /* - * In the generic case the entire file is data, so as long as - * offset isn't at the end of the file then the offset is data. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - break; - case SEEK_HOLE: - /* - * There is a virtual hole at the end of the file, so as long as - * offset isn't i_size or larger, return i_size. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - offset = inode->i_size; - break; - } - - if (offset < 0 || offset > maxbytes) { - mutex_unlock(&inode->i_mutex); - return -EINVAL; - } - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - mutex_unlock(&inode->i_mutex); - return offset; + return generic_file_llseek_size(file, offset, origin, maxbytes); } const struct file_operations ext4_file_operations = { -- cgit v1.2.3-58-ga151 From 79835a710d6ff811659c8de46f89c7577c3b8cc6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:52 -0700 Subject: nfs: drop unnecessary locking in llseek This makes NFS follow the standard generic_file_llseek locking scheme. Cc: Trond.Myklebust@netapp.com Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/nfs/file.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 12623abcf3d4..91c01f0a4c3b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -180,8 +180,6 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { - loff_t loff; - dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, @@ -197,14 +195,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; + } - /* AK: should drop this lock. Unlikely to be needed. */ - spin_lock(&inode->i_lock); - loff = generic_file_llseek(filp, offset, origin); - spin_unlock(&inode->i_lock); - } else - loff = generic_file_llseek(filp, offset, origin); - return loff; + return generic_file_llseek(filp, offset, origin); } /* -- cgit v1.2.3-58-ga151 From f3c7691e8d30d88899b514675c7c86d19057b5fd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Sep 2011 10:58:13 -0400 Subject: leases: fix write-open/read-lease race In setlease, we use i_writecount to decide whether we can give out a read lease. In open, we break leases before incrementing i_writecount. There is therefore a window between the break lease and the i_writecount increment when setlease could add a new read lease. This would leave us with a simultaneous write open and read lease, which shouldn't happen. Signed-off-by: J. 
Bruce Fields Signed-off-by: Christoph Hellwig --- fs/namei.c | 5 +---- fs/open.c | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9061157e39d6..7657be4352bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2035,10 +2035,7 @@ static int may_open(struct path *path, int acc_mode, int flag) if (flag & O_NOATIME && !inode_owner_or_capable(inode)) return -EPERM; - /* - * Ensure there are no outstanding leases on the file. - */ - return break_lease(inode, flag); + return 0; } static int handle_truncate(struct file *filp) diff --git a/fs/open.c b/fs/open.c index f71192109457..22c41b543f2d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; + error = break_lease(inode, f->f_flags); + if (error) + goto cleanup_all; + if (!open && f->f_op) open = f->f_op->open; if (open) { -- cgit v1.2.3-58-ga151 From d12799b4c3c4d1e65d7fdbb5d91538c92d6863dd Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 26 Oct 2011 23:41:04 +0400 Subject: CIFS: Remove extra mutex_unlock in cifs_lock_add_if to prevent the mutex being unlocked twice if we interrupt a blocked lock. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ea096ce5d4f7..91e05f2f0acf 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -778,7 +778,6 @@ try_again: else { mutex_lock(&cinode->lock_mutex); list_del_init(&lock->blist); - mutex_unlock(&cinode->lock_mutex); } } -- cgit v1.2.3-58-ga151 From 8ea00c6977d8b1463ee86d6689c8ef35ee2529a0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 28 Oct 2011 14:49:46 -0500 Subject: [CIFS] Update cifs version to 1.76 Update cifs version to 1.76 now that async read, lock caching, and changes to oplock enabled interface are in. Thanks to Pavel for reminding me. Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d9dbaf869cd1..30ff56005d8f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.75" +#define CIFS_VERSION "1.76" #endif /* _CIFSFS_H */ -- cgit v1.2.3-58-ga151 From fba90ffee813e2425feb9a57c532b3d297af18c3 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Oct 2011 09:03:00 -0400 Subject: ext4: migrate cleanup This patch cleanup code a bit, actual logic not changed - Move current block pointer to migrate_structure, let's all walk info will be in one structure. - Get rid of usless null ind-block ptr checks, caller already does that check. 
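The distilled shape of the cleanup (fields and types copied from the diff below): all walk state, including the logical-block cursor that callers previously threaded through as a separate by-reference ext4_lblk_t counter, now lives in one structure:

    struct migrate_struct {
            ext4_lblk_t first_block, last_block, curr_block;	/* cursor lives here */
            ext4_fsblk_t first_pblock, last_pblock;
    };

A hole in the walk then simply advances lb->curr_block in place instead of updating a block count passed down by pointer.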
Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 101 +++++++++++++++++++----------------------------------- 1 file changed, 35 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6f07a06f2437..8a9a0912fdae 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -20,13 +20,13 @@ * The contiguous blocks details which can be * represented by a single extent */ -struct list_blocks_struct { - ext4_lblk_t first_block, last_block; +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, - struct list_blocks_struct *lb) + struct migrate_struct *lb) { int retval = 0, needed; @@ -86,8 +86,7 @@ err_out: } static int update_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t blk_num, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* @@ -95,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && - (lb->last_block+1 == blk_num)) { + (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; - lb->last_block = blk_num; + lb->last_block = lb->curr_block; + lb->curr_block++; return 0; } /* @@ -105,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; - lb->first_block = lb->last_block = blk_num; - + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries; - return 0; - } - bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++, blk_count++) { + for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; + } else { + lb->curr_block++; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -171,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ - blk_count += max_entries; + 
lb->curr_block += max_entries; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -211,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; - } else + } else { /* Only update the file block number */ - blk_count += max_entries * max_entries; + lb->curr_block += max_entries * max_entries; + } } - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; @@ -461,10 +434,9 @@ int ext4_ext_migrate(struct inode *inode) handle_t *handle; int retval = 0, i; __le32 *i_data; - ext4_lblk_t blk_count = 0; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; - struct list_blocks_struct lb; + struct migrate_struct lb; unsigned long max_entries; __u32 goal; @@ -550,35 +522,32 @@ int ext4_ext_migrate(struct inode *inode) /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; - for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[i]), - blk_count, &lb); + le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; - } + } else + lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_IND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries; + lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_DIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries * max_entries; + lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_TIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } -- cgit v1.2.3-58-ga151 From 5cb81dabcc28863e7d04e6fd9ede154bd8459c14 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Oct 2011 09:05:00 -0400 Subject: ext4: fix quota accounting during migration The tmp_inode should have same uid/gid as the original inode. Otherwise new metadata blocks will be accounted to wrong quota-id, which will result in a quota leak after the inode migration is completed. 
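The essence of the fix, as the diff below implements it inside ext4_ext_migrate(): pass the original inode's identity down to ext4_new_inode() so the temporary inode's metadata blocks are charged to the same quota id:

    uid_t owner[2];

    owner[0] = inode->i_uid;
    owner[1] = inode->i_gid;
    tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
                               S_IFREG, NULL, goal, owner);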
Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 9 ++++++--- fs/ext4/migrate.c | 7 +++++-- fs/ext4/namei.c | 8 ++++---- 4 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3647ae0b21ab..657e82649fa5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1811,7 +1811,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, - const struct qstr *qstr, __u32 goal); + const struct qstr *qstr, __u32 goal, + uid_t *owner); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e007fecdaedc..acdde93b74d7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -691,7 +691,7 @@ err_ret: * group to find a free inode. */ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr, __u32 goal) + const struct qstr *qstr, __u32 goal, uid_t *owner) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -852,8 +852,11 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - - if (test_opt(sb, GRPID)) { + if (owner) { + inode->i_mode = mode; + inode->i_uid = owner[0]; + inode->i_gid = owner[1]; + } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 8a9a0912fdae..f729377bf043 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -439,6 +439,7 @@ int ext4_ext_migrate(struct inode *inode) struct migrate_struct lb; unsigned long max_entries; __u32 goal; + uid_t owner[2]; /* * If the filesystem does not support extents, or the inode @@ -466,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode) } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = inode->i_uid; + owner[1] = inode->i_gid; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, NULL, goal); + S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = -ENOMEM; + retval = PTR_ERR(inode); ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4a550aa07614..f73d582e89f9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1754,7 +1754,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1790,7 +1790,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1830,7 +1830,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0); + &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2277,7 +2277,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0); + &dentry->d_name, 0, 
NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -- cgit v1.2.3-58-ga151 From 81fdbb4a8d34242f0ed048395c4ddc910f1dffbe Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 29 Oct 2011 09:23:38 -0400 Subject: ext4: move variables to their scope Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 29969622af8b..05693804813d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -117,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - int depth; - if (path) { + int depth = path->p_depth; struct ext4_extent *ex; - depth = path->p_depth; /* * Try to predict block placement assuming that we are @@ -247,7 +245,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); - int idxs, num = 0; + int idxs; idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx)); @@ -262,6 +260,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) */ if (ei->i_da_metadata_calc_len && ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + if ((ei->i_da_metadata_calc_len % idxs) == 0) num++; if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) @@ -324,8 +324,6 @@ static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, int depth) { - struct ext4_extent *ext; - struct ext4_extent_idx *ext_idx; unsigned short entries; if (eh->eh_entries == 0) return 1; @@ -334,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ - ext = EXT_FIRST_EXTENT(eh); + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; @@ -342,7 +340,7 @@ static int ext4_valid_extent_entries(struct inode *inode, entries--; } } else { - ext_idx = EXT_FIRST_INDEX(eh); + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; @@ -3722,13 +3720,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; - unsigned int allocated_clusters = 0, reserved_clusters = 0; + unsigned int allocated_clusters = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext4_lblk_t cluster_offset; - struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -3804,6 +3801,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { + struct ext4_map_blocks punch_map; ext4_fsblk_t partial_cluster = 0; newblock = map->m_lblk - ee_block + ee_start; @@ -4084,8 +4082,9 @@ got_allocated_blocks: * block allocation which had been deferred till now. */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; /* - * Check how many clusters we had reserved this allocted range. 
+ * Check how many clusters we had reserved this allocated range */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); -- cgit v1.2.3-58-ga151 From 02dc62fba89eaee0157752c5f1ba811ef3156e00 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 29 Oct 2011 09:29:11 -0400 Subject: ext4: clean up AGGRESSIVE_TEST code Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 05693804813d..652b4dc5dfcb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -181,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (!check && size > 6) + size = 6; #endif - } return size; } @@ -196,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (!check && size > 5) + size = 5; #endif - } return size; } @@ -212,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (!check && size > 3) + size = 3; #endif - } return size; } @@ -228,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (!check && size > 4) + size = 4; #endif - } return size; } -- cgit v1.2.3-58-ga151 From e7b319e39776bd0e9c0c7855b023dafed2c93d27 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 29 Oct 2011 09:39:51 -0400 Subject: ext4: trace punch_hole correctly in ext4_ext_map_blocks When ext4_ext_map_blocks() is called by punch_hole, trace should trace blocks punched out. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 652b4dc5dfcb..36a0f177ecad 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4171,12 +4171,12 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? punched_out : allocated; + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : result); + return err ? err : result; } -- cgit v1.2.3-58-ga151 From 6d6a435190bdf2e04c9465cde5bdc3ac68cf11a4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 29 Oct 2011 10:15:35 -0400 Subject: ext4: fix race in xattr block allocation path Ceph users reported that when using Ceph on ext4, the filesystem would often become corrupted, containing inodes with incorrect i_blocks counters. I managed to reproduce this with a very hacked-up "streamtest" binary from the Ceph tree. Ceph is doing a lot of xattr writes, to out-of-inode blocks. 
There is also another thread which does sync_file_range and close on the same files. The problem appears to happen due to this race:

  sync/flush thread                    xattr-set thread
  -----------------                    ----------------
  do_writepages                        ext4_xattr_set
    ext4_da_writepages                   ext4_xattr_set_handle
      mpage_da_map_blocks                  ext4_xattr_block_set
      set DELALLOC_RESERVE                   ext4_new_meta_blocks
                                               ext4_mb_new_blocks
                                                 if (!i_delalloc_reserved_flag)
                                                   vfs_dq_alloc_block
        ext4_get_blocks
          down_write(i_data_sem)
          set i_delalloc_reserved_flag
          ...
          up_write(i_data_sem)
                                                 if (i_delalloc_reserved_flag)
                                                   vfs_dq_alloc_block_nofail

In other words, the sync/flush thread pops in and sets i_delalloc_reserved_flag on the inode, which makes the xattr thread think that it's in a delalloc path in ext4_new_meta_blocks(), and add the block for a second time, after already having added it once in the !i_delalloc_reserved_flag case in ext4_mb_new_blocks. The real problem is that we shouldn't be using the DELALLOC_RESERVED state flag, and instead we should be passing EXT4_GET_BLOCKS_DELALLOC_RESERVE down to ext4_map_blocks() instead of using an inode state flag. We'll fix this for now by using i_data_sem to prevent this race, but this is really not the right way to fix things. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/xattr.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 0ae3668520f8..93a00d89a220 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -820,8 +820,14 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; -- cgit v1.2.3-58-ga151 From 5079276066cc421b48a6a63a54a34775979e8506 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 29 Oct 2011 17:17:57 +0400 Subject: CIFS: Fix the VFS brlock cache usage in posix locking case Request the cache in the FL_POSIX case only. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 91e05f2f0acf..c1f063cd1b0c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -793,6 +793,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); unsigned char saved_type = flock->fl_type; + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + mutex_lock(&cinode->lock_mutex); posix_test_lock(file, flock); @@ -809,12 +812,15 @@ static int cifs_posix_lock_set(struct file *file, struct file_lock *flock) { struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); - int rc; + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; mutex_lock(&cinode->lock_mutex); if (!cinode->can_cache_brlcks) { mutex_unlock(&cinode->lock_mutex); - return 1; + return rc; } rc = posix_lock_file_wait(file, flock); mutex_unlock(&cinode->lock_mutex); -- cgit v1.2.3-58-ga151 From 9ef5992e442b2b0bf6364bfcc5574e983a983159 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 20 Oct 2011 13:21:59 -0500 Subject: cifs: Assume passwords are encoded according to iocharset (try #2) Re-posting a patch originally posted by Oskar Liljeblad after rebasing on 3.2.
Modify cifs to assume that the supplied password is encoded according to iocharset. Before this patch passwords would be treated as raw 8-bit data, which made authentication with Unicode passwords impossible (at least passwords with characters > 0xFF). The previous code would as a side effect accept passwords encoded with ISO 8859-1, since Unicode < 0x100 basically is ISO 8859-1. Software which relies on that will no longer support password chars > 0x7F unless it also uses iocharset=iso8859-1. (mount.cifs does not care about the encoding so it will work as expected.) Signed-off-by: Oskar Liljeblad Signed-off-by: Shirish Pargaonkar Reviewed-by: Pavel Shilovsky Tested-by: A Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 8 +++---- fs/cifs/cifsproto.h | 8 ++++--- fs/cifs/connect.c | 2 +- fs/cifs/sess.c | 2 +- fs/cifs/smbencrypt.c | 63 ++++++++++----------------------------------------- 5 files changed, 23 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 2cfb695d1f89..5d9b9acc5fce 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, } /* first calculate 24 bytes ntlm response and then 16 byte session key */ -int setup_ntlm_response(struct cifs_ses *ses) +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; @@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { cFYI(1, "%s Can't generate NTLM response, error: %d", __func__, rc); return rc; } - rc = E_md4hash(ses->password, temp_key); + rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + E_md4hash(ses->password, nt_hash, nls_cp); rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ef4f631e4c01..6f4e243e0f62 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); -extern int setup_ntlm_response(struct cifs_ses *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); @@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); -extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int E_md4hash(const unsigned char *passwd, unsigned char 
*p16, + const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d545a95c30ed..c0458c543f17 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3452,7 +3452,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, else #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr); + bcc_ptr, nls_codepage); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c7d80e24f24e..4ec3ee9d72cc 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses); + rc = setup_ntlm_response(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLM authentication", rc); goto ssetup_exit; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index ac1221d969d6..7cacba12b8f1 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) return rc; } -/* Routines for Windows NT MD4 Hash functions. */ -static int -_my_wcslen(__u16 *str) -{ - int len = 0; - while (*str++ != 0) - len++; - return len; -} - -/* - * Convert a string into an NT UNICODE string. - * Note that regardless of processor type - * this must be in intel (little-endian) - * format. - */ - -static int -_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) -{ /* BB not a very good conversion routine - change/fix */ - int i; - __u16 val; - - for (i = 0; i < len; i++) { - val = *src; - SSVAL(dst, 0, val); - dst++; - src++; - if (val == 0) - break; - } - return i; -} - /* * Creates the MD4 Hash of the users password in NT UNICODE. */ int -E_md4hash(const unsigned char *passwd, unsigned char *p16) +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) { int rc; int len; __u16 wpwd[129]; /* Password cannot be longer than 128 characters */ - if (passwd) { - len = strlen((char *) passwd); - if (len > 128) - len = 128; - - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - } else + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + else { len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } - wpwd[len] = 0; /* Ensure string is null terminated */ - /* Calculate length in bytes */ - len = _my_wcslen(wpwd) * sizeof(__u16); - - rc = mdfour(p16, (unsigned char *) wpwd, len); - memset(wpwd, 0, 129 * 2); + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16)); + memset(wpwd, 0, 129 * sizeof(__u16)); return rc; } /* Does the NT MD4 hash then des encryption. 
*/ int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) { int rc; unsigned char p16[16], p21[21]; @@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p16); + rc = E_md4hash(passwd, p16, codepage); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; -- cgit v1.2.3-58-ga151 From ad4e38dd6a33bb3a4882c487d7abe621e583b982 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 7 Oct 2011 21:51:56 -0600 Subject: writeback: send work item to queue_io, move_expired_inodes Instead of sending ->older_than_this to queue_io() and move_expired_inodes(), send the entire wb_writeback_work structure. There are other fields of a work item that are useful in these routines and in tracepoints. Acked-by: Jan Kara Signed-off-by: Curt Wohlgemuth Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 ++++++++-------- include/trace/events/writeback.h | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6401cd76f109..c51029693600 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -251,7 +251,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + struct wb_writeback_work *work) { LIST_HEAD(tmp); struct list_head *pos, *node; @@ -262,8 +262,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; @@ -302,13 +302,13 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); - trace_writeback_queue_io(wb, older_than_this, moved); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + trace_writeback_queue_io(wb, work, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -651,7 +651,7 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, NULL); + queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); @@ -745,7 +745,7 @@ static long wb_writeback(struct bdi_writeback *wb, trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, work->older_than_this); + queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0ce9f06f58c2..1261db3916cc 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -157,9 +157,9 @@ DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, - unsigned long *older_than_this, + struct wb_writeback_work *work, int 
moved), - TP_ARGS(wb, older_than_this, moved), + TP_ARGS(wb, work, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) @@ -167,6 +167,7 @@ TRACE_EVENT(writeback_queue_io, __field(int, moved) ), TP_fast_assign( + unsigned long *older_than_this = work->older_than_this; strncpy(__entry->name, dev_name(wb->bdi->dev), 32); __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? -- cgit v1.2.3-58-ga151 From 0e175a1835ffc979e55787774e58ec79e41957d7 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 7 Oct 2011 21:54:10 -0600 Subject: writeback: Add a 'reason' to wb_writeback_work This creates a new 'reason' field in a wb_writeback_work structure, which unambiguously identifies who initiates writeback activity. A 'wb_reason' enumeration has been added to writeback.h, to enumerate the possible reasons. The 'writeback_work_class' tracepoint event class and the 'writeback_queue_io' tracepoint are updated to include the symbolic 'reason' in all trace events. And the 'writeback_inodes_sbXXX' family of routines has had a wb_reason parameter added to them, so callers can specify why writeback is being started. Acked-by: Jan Kara Signed-off-by: Curt Wohlgemuth Signed-off-by: Wu Fengguang --- fs/btrfs/extent-tree.c | 3 ++- fs/buffer.c | 2 +- fs/ext4/inode.c | 2 +- fs/fs-writeback.c | 49 +++++++++++++++++++++++++++++----------- fs/quota/quota.c | 2 +- fs/sync.c | 4 ++-- fs/ubifs/budget.c | 2 +- include/linux/backing-dev.h | 3 ++- include/linux/writeback.h | 32 +++++++++++++++++++++----- include/trace/events/writeback.h | 14 ++++++++---- mm/backing-dev.c | 3 ++- mm/page-writeback.c | 3 ++- mm/vmscan.c | 3 ++- 13 files changed, 88 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..c9ee0e18bbdc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3340,7 +3340,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); if (reserved > space_info->bytes_reserved) diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..f5dcee6c4cfb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -285,7 +285,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986e2388f031..7fa73a3b2120 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2241,7 +2241,7 @@ static int ext4_nonda_switch(struct super_block *sb) * start pushing delalloc when 1/2 of free blocks are dirty. */ if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb); + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c51029693600..73c3992b2bb4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -41,11 +41,23 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + enum wb_reason reason; /* why was writeback initiated?
*/ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ }; +const char *wb_reason_name[] = { + [WB_REASON_BACKGROUND] = "background", + [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", + [WB_REASON_SYNC] = "sync", + [WB_REASON_PERIODIC] = "periodic", + [WB_REASON_LAPTOP_TIMER] = "laptop_timer", + [WB_REASON_FREE_MORE_MEM] = "free_more_memory", + [WB_REASON_FS_FREE_SPACE] = "fs_free_space", + [WB_REASON_FORKER_THREAD] = "forker_thread" +}; + /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this @@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->reason = reason; bdi_queue_work(bdi, work); } @@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, reason); } /** @@ -641,12 +655,14 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, + .reason = reason, }; spin_lock(&wb->list_lock); @@ -825,6 +841,7 @@ static long wb_check_background_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); @@ -858,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); @@ -976,7 +994,7 @@ int bdi_writeback_thread(void *data) * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; @@ -989,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1210,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb) * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. 
*/ -void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { @@ -1219,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) .tagged_writepages = 1, .done = &done, .nr_pages = nr, + .reason = reason, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1235,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1248,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb) +int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, reason); up_read(&sb->s_umount); return 1; } else @@ -1269,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); * Returns 1 if writeback was started, 0 if not. */ int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr) + unsigned long nr, + enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr); + writeback_inodes_sb_nr(sb, nr, reason); up_read(&sb->s_umount); return 1; } else @@ -1297,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb) .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, + .reason = WB_REASON_SYNC, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 10b6be3ca280..4bae57fc603b 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); return 0; default: return -EINVAL; diff --git a/fs/sync.c b/fs/sync.c index c98a7477edfd..101b8ef901d7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -98,7 +98,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, WB_REASON_SYNC); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66e52b2..bc4f94b28706 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -63,7 +63,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3b92010d894..b1038bd686ac 100644 --- a/include/linux/backing-dev.h +++ 
b/include/linux/backing-dev.h @@ -118,7 +118,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index ddb4652cb337..a378c295851f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -38,6 +38,23 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_TRY_TO_FREE_PAGES, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; +extern const char *wb_reason_name[]; + /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised @@ -69,14 +86,17 @@ struct writeback_control { */ struct bdi_writeback; int inode_wait(void *); -void writeback_inodes_sb(struct super_block *); -void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); -int writeback_inodes_sb_if_idle(struct super_block *); -int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); +void writeback_inodes_sb(struct super_block *, enum wb_reason reason); +void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); +int writeback_inodes_sb_if_idle(struct super_block *, enum wb_reason reason); +int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); -void wakeup_flusher_threads(long nr_pages); +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); /* writeback.h requires fs.h; it, too, is not included from here. 
*/ static inline void wait_on_inode(struct inode *inode) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1261db3916cc..b99caa8b780c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -34,6 +34,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) + __field(int, reason) ), TP_fast_assign( strncpy(__entry->name, dev_name(bdi->dev), 32); @@ -43,16 +44,18 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; + __entry->reason = work->reason; ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d", + "kupdate=%d range_cyclic=%d background=%d reason=%s", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, - __entry->for_background + __entry->for_background, + wb_reason_name[__entry->reason] ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -165,6 +168,7 @@ TRACE_EVENT(writeback_queue_io, __field(unsigned long, older) __field(long, age) __field(int, moved) + __field(int, reason) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -173,12 +177,14 @@ TRACE_EVENT(writeback_queue_io, __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; + __entry->reason = work->reason; ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ - __entry->moved) + __entry->moved, + wb_reason_name[__entry->reason]) ); TRACE_EVENT(global_dirty_state, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5dcaa3c756d1..dd8916feb05e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -476,7 +476,8 @@ static int bdi_forker_thread(void *ptr) * the bdi from the thread. Hopefully 1024 is * large enough for efficient IO. */ - writeback_inodes_wb(&bdi->wb, 1024); + writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); } else { /* * The spinlock makes sure we do not lose diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45d36f7dc169..650846b61584 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1301,7 +1301,8 @@ void laptop_mode_timer_fn(unsigned long data) * threshold */ if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages); + bdi_start_writeback(&q->backing_dev_info, nr_pages, + WB_REASON_LAPTOP_TIMER); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index b55699cd9067..c735bd770d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2181,7 +2181,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 
0 : total_scanned, + WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } -- cgit v1.2.3-58-ga151 From d73d5046a72467d4510825b99e2269e09ad80e15 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 30 Oct 2011 18:26:08 -0400 Subject: ext4: Use correct locking for ext4_end_io_nolock() We must hold i_completed_io_lock when manipulating anything on the i_completed_io_list linked list. This includes io->list, which we were checking in ext4_end_io_nolock(). So move this check to ext4_end_io_work(). This also has the bonus of avoiding extra work, without needing to take the mutex, if the conversion is already done. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 3 --- fs/ext4/page-io.c | 14 +++++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c942924a0645..851ac5b3cec9 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -83,9 +83,6 @@ int ext4_flush_completed_IO(struct inode *inode) int ret = 0; int ret2 = 0; - if (list_empty(&ei->i_completed_io_list)) - return ret; - dump_completed_IO(inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&ei->i_completed_io_list)){ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 92f38ee13f8a..aed40966f342 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -87,6 +87,9 @@ void ext4_free_io_end(ext4_io_end_t *io) /* * check a range of space and convert unwritten extents to written. + * + * Called with inode->i_mutex; we depend on this when we manipulate + * io->flag, since we could otherwise race with ext4_flush_completed_IO() */ int ext4_end_io_nolock(ext4_io_end_t *io) { @@ -100,9 +103,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (list_empty(&io->list)) - return ret; - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) return ret; @@ -142,6 +142,13 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&io->list)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + goto free; + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + if (!mutex_trylock(&inode->i_mutex)) { /* * Requeue the work instead of waiting so that the work @@ -170,6 +177,7 @@ static void ext4_end_io_work(struct work_struct *work) list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); +free: ext4_free_io_end(io); -- cgit v1.2.3-58-ga151 From 4e298021216727cc27017c5032ade86167c66256 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Oct 2011 18:41:19 -0400 Subject: ext4: remove unnecessary call to waitqueue_active() The usage of waitqueue_active() is not necessary, and introduces (I believe) a hard-to-hit race.
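The pattern being removed and its replacement, in sketch form (wake_up_all() on a waitqueue with no waiters is cheap, so the extra check buys nothing):

	/* racy: a waiter can enqueue itself between the dec-and-test and
	 * the waitqueue_active() check, and then sleep forever */
	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
	    waitqueue_active(wq))
		wake_up_all(wq);

	/* safe: wake unconditionally once the count hits zero */
	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
		wake_up_all(ext4_ioend_wq(io->inode));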
Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index aed40966f342..4fa1d709b604 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page) void ext4_free_io_end(ext4_io_end_t *io) { int i; - wait_queue_head_t *wq; BUG_ON(!io); if (io->page) @@ -78,10 +77,8 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && - waitqueue_active(wq)) - wake_up_all(wq); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io->inode)); kmem_cache_free(io_end_cachep, io); } @@ -96,7 +93,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; - wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," @@ -121,11 +117,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->flag & EXT4_IO_END_UNWRITTEN) { io->flag &= ~EXT4_IO_END_UNWRITTEN; /* Wake up anyone waiting on unwritten extent conversion */ - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); } return ret; -- cgit v1.2.3-58-ga151 From b82e384c7bb9a19036b4daf58fa216df7cd48aa0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Oct 2011 10:56:32 -0400 Subject: ext4: optimize locking for end_io extent conversion Now that we are doing the locking correctly, we need to grab the i_completed_io_lock() twice per end_io. We can clean this up by removing the structure from the i_complted_io_list, and use this as the locking mechanism to prevent ext4_flush_completed_IO() racing against ext4_end_io_work(), instead of clearing the EXT4_IO_END_UNWRITTEN in io->flag. In addition, if the ext4_convert_unwritten_extents() returns an error, we no longer keep the end_io structure on the linked list. This doesn't help, because it tends to lock up the file system and wedges the system. That's one way to call attention to the problem, but it doesn't help the overall robustness of the system. Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 5 ++--- fs/ext4/page-io.c | 37 +++++++++++-------------------------- 2 files changed, 13 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 851ac5b3cec9..00a2cb753efd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -88,6 +88,7 @@ int ext4_flush_completed_IO(struct inode *inode) while (!list_empty(&ei->i_completed_io_list)){ io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); + list_del_init(&io->list); /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -104,11 +105,9 @@ int ext4_flush_completed_IO(struct inode *inode) */ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; - else - list_del_init(&io->list); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? 
ret2 : 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4fa1d709b604..9eebf44646f6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,28 +99,21 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) - return ret; - ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten " - "extents to written extents, error is %d " - "io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); } if (io->iocb) aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - if (io->flag & EXT4_IO_END_UNWRITTEN) { - io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); - } + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } @@ -133,16 +126,15 @@ static void ext4_end_io_work(struct work_struct *work) struct inode *inode = io->inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned long flags; - int ret; spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (list_empty(&io->list)) { spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); goto free; } if (!mutex_trylock(&inode->i_mutex)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work * items queued after this can be processed. @@ -159,16 +151,9 @@ static void ext4_end_io_work(struct work_struct *work) io->flag |= EXT4_IO_END_QUEUED; return; } - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); + list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + (void) ext4_end_io_nolock(io); mutex_unlock(&inode->i_mutex); free: ext4_free_io_end(io); -- cgit v1.2.3-58-ga151 From 92407e75ce45b41c46944891711fd8faf0714d84 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 23 Oct 2011 20:21:17 -0700 Subject: nfs4: serialize layoutcommit Current pnfs_layoutcommit_inode cannot handle parallel layoutcommits. And as Trond suggested, there is no need for the client to optimize for parallel layoutcommits. So add an NFS_INO_LAYOUTCOMMITTING flag to mark an inflight layoutcommit and serialize layoutcommits with it. Also mark_inode_dirty_sync if pnfs_layoutcommit_inode fails to issue a layoutcommit.
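The serialization below uses the standard bit-lock idiom; a sketch of its shape (error handling elided):

	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
		if (!sync)
			return -EAGAIN;		/* a commit is already in flight */
		wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
				 nfs_wait_bit_killable, TASK_KILLABLE);
	}
	/* ... issue the layoutcommit ... */
	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
	smp_mb__after_clear_bit();
	wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);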
Reported-by: Vitaliy Gusev Signed-off-by: Peng Tao Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 ++++++ fs/nfs/pnfs.c | 25 ++++++++++++++++++++++--- include/linux/nfs_fs.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2ae413c986a..b60fddf606f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5950,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ @@ -5959,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata) &lseg->pls_flags)) put_lseg(lseg); } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f700..a2478bc74442 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1443,17 +1443,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { - mark_inode_dirty_sync(inode); status = -ENOMEM; goto out; } + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_free; + + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) { + status = -EAGAIN; + goto out_free; + } + status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (status) + goto out_free; + } + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); spin_unlock(&inode->i_lock); - kfree(data); - goto out; + wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); + goto out_free; } pnfs_list_write_lseg(inode, &data->lseg_list); @@ -1475,6 +1489,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = nfs4_proc_layoutcommit(data, sync); out: + if (status) + mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; +out_free: + kfree(data); + goto out; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 60a137b7f171..ab2c6343361a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -229,6 +229,7 @@ struct nfs_inode { #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ #define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ +#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3-58-ga151 From d743c3c9c236cc61403a4f7d6283d59ddf68b2bd Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 23 Oct 2011 20:22:38 -0700 Subject: NFS4: fix cb_recallany decode error craa_type_mask is bitmap4 per RFC5661. We need to expect a length before extracting bitmap value. 
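On the wire a bitmap4 is a counted array rather than a single word, so the decoder must consume a length first (sketch of the XDR layout):

	/* bitmap4, RFC 5661: */
	uint32_t len;		/* number of 32-bit mask words that follow */
	uint32_t word[/*len*/];	/* craa_type_mask lives in word[0] */

hence the replacement below of the bare read_buf()/ntohl() pair with decode_bitmap().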
Cc: Alexandros Batsakis Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 918ad647afea..ee1a5b3cd48d 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - __be32 *p; + uint32_t bitmap[2]; + __be32 *p, status; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_BADXDR); - args->craa_type_mask = ntohl(*p); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; return 0; } -- cgit v1.2.3-58-ga151 From c02f557dd0a026d7147da3b6f7daf52c6ff5580f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:17:43 -0400 Subject: NFS: Fix documenting comment for nfs_create_request() Clean up: the first parameter of nfs_create_request() has been incorrectly documented since time immemorial (OK, since before 2.6.12). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1f..0a5ff5c19511 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -41,7 +41,7 @@ nfs_page_free(struct nfs_page *p) /** * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use + * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write -- cgit v1.2.3-58-ga151 From c6e696660213a89a5bfde8b49d539553904c808f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:17:53 -0400 Subject: NFS: Clean up nfs4_xdr_dec_secinfo() Clean up: Remove superfluous logic at the tail of nfs4_xdr_dec_secinfo() . Introduced by commit 5a5ea0d4 "NFS: Add secinfo procedure" (March 24, 2011). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1dce12f41a4f..e6161b213ed1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, if (status) goto out; status = decode_secinfo(xdr, res); - if (status) - goto out; out: return status; } -- cgit v1.2.3-58-ga151 From e414966b81a74745ac8d6bfeda0d95fb721e6d91 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:18:03 -0400 Subject: NFS: Remove no-op less-than-zero checks on unsigned variables. Introduced by commit 16b374ca "NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO infrastructure" (October 20, 2010). 
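Both removed tests are dead code because the fields are unsigned, e.g.:

	u32 first_stripe_index;

	if (first_stripe_index < 0)	/* always false for an unsigned type */
		goto out_put;

the compiler simply discards the comparison, so only the upper-bound checks do any work.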
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402f..12185aadb349 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -449,9 +449,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, fl->dsaddr = dsaddr; - if (fl->first_stripe_index < 0 || - fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %d\n", + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); goto out_put; } @@ -552,7 +551,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. * Futher checking is done in filelayout_check_layout */ - if (fl->num_fh < 0 || fl->num_fh > + if (fl->num_fh > max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; -- cgit v1.2.3-58-ga151 From 0edeb71dc9133bfb505d3bf59642e07cd936613e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 31 Oct 2011 17:30:44 -0400 Subject: ext4: Create helper function for EXT4_IO_END_UNWRITTEN and i_aiodio_unwritten Setting the EXT4_IO_END_UNWRITTEN flag and increasing i_aiodio_unwritten should be done simultaneously, since ext4_end_io_nolock always clears the flag and decreases the counter at the same time. We have found some bugs where the flag is set while i_aiodio_unwritten is left unchanged (commit 32c80b32c053d). So this patch creates a helper function to wrap the two operations and avoid any future bug. The idea is inspired by Eric. Cc: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 9 +++++++++ fs/ext4/extents.c | 18 ++++++------------ fs/ext4/inode.c | 5 +---- fs/ext4/page-io.c | 6 ++---- 4 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 657e82649fa5..604c200216c1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1275,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } +} + /* * Inode dynamic state flags */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 36a0f177ecad..a5c8caaaa099 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3472,12 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io) { - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -4030,12 +4027,9 @@ got_allocated_blocks: * that we need to perform conversion when IO is done.
*/ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) { - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e4b26faac5ff..60af5126eb05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2831,10 +2831,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) * but being more careful is always safe for the future change. */ inode = io_end->inode; - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + ext4_set_io_unwritten_flag(inode, io_end); /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9eebf44646f6..7ce1d0b19c94 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -336,10 +336,8 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + if (buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); -- cgit v1.2.3-58-ga151 From e260daf27902b2189a9198f5b64fa4567939bb5b Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 31 Oct 2011 17:54:36 -0400 Subject: ext4: move vars to local scope in ext4_discard_partial_page_buffers_no_lock() Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 60af5126eb05..f60b459b27d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3179,7 +3179,6 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned int offset = from & (PAGE_CACHE_SIZE-1); unsigned int blocksize, max, pos; - unsigned int end_of_block, range_to_discard; ext4_lblk_t iblock; struct buffer_head *bh; int err = 0; @@ -3231,6 +3230,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, pos = offset; while (pos < offset + length) { + unsigned int end_of_block, range_to_discard; + err = 0; /* The length of space left to zero and unmap */ -- cgit v1.2.3-58-ga151 From 5129d05fda57be13f434dbe8536de39a6c25496d Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 31 Oct 2011 17:56:10 -0400 Subject: ext4: return ENOMEM if find_or_create_pages fails Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f60b459b27d7..5fcef98f98e5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3129,7 +3129,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) - return -EINVAL; + return -ENOMEM; err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, from, length, flags); @@ -3372,7 +3372,7 @@ int 
ext4_block_zero_page_range(handle_t *handle, page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) - return -EINVAL; + return -ENOMEM; blocksize = inode->i_sb->s_blocksize; max = blocksize - (offset & (blocksize - 1)); -- cgit v1.2.3-58-ga151 From edb5ac8993e25143f6af1ab143843a65c52e2a15 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 31 Oct 2011 18:04:38 -0400 Subject: ext4: let ext4_discard_partial_buffers handle unaligned range correctly As comment says, we should handle unaligned range rather than aligned one. This fixes a bug found by running xfstests #91. Signed-off-by: Yongqiang Yang --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5fcef98f98e5..de05e86f1993 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3206,8 +3206,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, * to be updated with the contents of the block before * we write the zeros on top of it. */ - if (!(from & (blocksize - 1)) || - !((from + length) & (blocksize - 1))) { + if ((from & (blocksize - 1)) || + ((from + length) & (blocksize - 1))) { create_empty_buffers(page, blocksize, 0); } else { /* -- cgit v1.2.3-58-ga151 From 4af835089984ce9e24c44a51be64c5524788e973 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 31 Oct 2011 18:21:29 -0400 Subject: ext4: remove comments about extent mount option in ext4_new_inode() Remove comments about 'extent' mount option in ext4_new_inode(), since it's no longer exists. Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index acdde93b74d7..612bec255c6c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -873,11 +873,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified - */ + /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; -- cgit v1.2.3-58-ga151 From 3c6fe77017bc6ce489f231c35fed3220b6691836 Mon Sep 17 00:00:00 2001 From: Greg Harm Date: Mon, 31 Oct 2011 18:41:47 -0400 Subject: ext4: Don't normalize an falloc request if it can fit in 1 extent. If an fallocate request fits in EXT_UNINIT_MAX_LEN, then set the EXT4_GET_BLOCKS_NO_NORMALIZE flag. For larger fallocate requests, let mballoc.c normalize the request. This fixes a problem where large requests were being split into non-contiguous extents due to commit 556b27abf73: ext4: do not normalize block requests from fallocate. Testing: *) Checked that 8.x MB falloc'ed files are still laid down next to each other (contiguously). *) Checked that the maximum size extent (127.9MB) is allocated as 1 extent. *) Checked that a 1GB file is somewhat contiguous (often 5-6 non-contiguous extents now). *) Checked that a 120MB file can still be falloc'ed even if there are no single extents large enough to hold it. 
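The heart of the new test, as a user-space sketch (this assumes the usual ext4 definition of EXT_UNINIT_MAX_LEN as 2^15 - 1 filesystem blocks; len is in bytes, so the block limit is shifted left by blkbits before comparing):

#include <stdbool.h>
#include <stdio.h>

#define EXT_UNINIT_MAX_LEN	((1U << 15) - 1)	/* blocks per uninitialized extent */

static bool fits_in_one_extent(unsigned long long len, unsigned int blkbits)
{
	/* convert the per-extent block limit to bytes, then compare */
	return len <= ((unsigned long long)EXT_UNINIT_MAX_LEN << blkbits);
}

int main(void)
{
	/* with 4KiB blocks (blkbits == 12) the limit is ~127.9MB,
	 * matching the maximum-size extent measured above */
	printf("120MB fits: %d\n", fits_in_one_extent(120ULL << 20, 12));
	printf("1GB fits:   %d\n", fits_in_one_extent(1ULL << 30, 12));
	return 0;
}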
Signed-off-by: Greg Harm Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a5c8caaaa099..2798945a1920 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4337,10 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE; + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNINIT_MAX_LEN << blkbits) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; -- cgit v1.2.3-58-ga151 From 78ace70c4186c0d18314eb001637aa97d1585e65 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:05 -0700 Subject: udf: Convert printks to pr_ Use the current logging styles. Convert a few printks that should have been udf_warn and udf_err. Coalesce formats. Add #define pr_fmt. Move an #include "udfdecls.h" above other includes in udftime.c so pr_fmt works correctly. Strip prefixes from conversions as appropriate. Reorder logging definitions in udfdecl.h Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/directory.c | 4 +-- fs/udf/inode.c | 35 +++++++++++--------------- fs/udf/super.c | 73 ++++++++++++++++++++++++------------------------------ fs/udf/truncate.c | 22 ++++++++-------- fs/udf/udfdecl.h | 38 ++++++++++++++-------------- fs/udf/udftime.c | 3 ++- fs/udf/unicode.c | 6 ++--- 7 files changed, 83 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 2ffdb6733af1..4e0843897d07 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs struct short_ad *sa; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } @@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset struct long_ad *la; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f94d6f9febf5..f67e7e268c49 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -84,12 +84,10 @@ void udf_evict_inode(struct inode *inode) end_writeback(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { - printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " - "inode size %llu different from extent length %llu. " - "Filesystem need not be standards compliant.\n", - inode->i_sb->s_id, inode->i_ino, inode->i_mode, - (unsigned long long)inode->i_size, - (unsigned long long)iinfo->i_lenExtents); + udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. 
Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); } kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; @@ -1177,16 +1175,15 @@ static void __udf_read_inode(struct inode *inode) */ bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", - inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); make_bad_inode(inode); return; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) " - "failed ident=%d\n", inode->i_ino, ident); + udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", + inode->i_ino, ident); brelse(bh); make_bad_inode(inode); return; @@ -1226,8 +1223,8 @@ static void __udf_read_inode(struct inode *inode) } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { - printk(KERN_ERR "udf: unsupported strategy type: %d\n", - le16_to_cpu(fe->icbTag.strategyType)); + udf_err(inode->i_sb, "unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); brelse(bh); make_bad_inode(inode); return; @@ -1421,9 +1418,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) udf_debug("METADATA BITMAP FILE-----\n"); break; default: - printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " - "file type=%d\n", inode->i_ino, - fe->icbTag.fileType); + udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", + inode->i_ino, fe->icbTag.fileType); make_bad_inode(inode); return; } @@ -1446,8 +1442,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_ext.i_data) { - printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " - "no free memory\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) no free memory\n", + inode->i_ino); return -ENOMEM; } @@ -1697,9 +1693,8 @@ out: if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { - printk(KERN_WARNING "IO error syncing udf inode " - "[%s:%08lx]\n", inode->i_sb->s_id, - inode->i_ino); + udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + inode->i_ino); err = -EIO; } } diff --git a/fs/udf/super.c b/fs/udf/super.c index 622331f1290d..39e3f351c7a6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -547,8 +547,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->dmode = option & 0777; break; default: - printk(KERN_ERR "udf: bad mount option \"%s\" " - "or missing value\n", p); + pr_err("bad mount option \"%s\" or missing value\n", p); return 0; } } @@ -1106,11 +1105,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { - printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" - " last recorded block (%lu), retrying with the last " - "block of the device (%lu).\n", - (unsigned long)sbi->s_last_block, - (unsigned long)blocks - 1); + pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) @@ -1208,8 +1205,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) if (map->s_partition_type == 
UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i); if (ret) { - printk(KERN_ERR "UDF-fs: error loading MetaData " - "partition map %d\n", i); + udf_err(sb, "error loading MetaData partition map %d\n", + i); goto out_bh; } } else { @@ -1222,9 +1219,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * overwrite blocks instead of relocating them). */ sb->s_flags |= MS_RDONLY; - printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " - "because writing to pseudooverwrite partition is " - "not implemented.\n"); + pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); } out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ @@ -1466,9 +1461,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, bh = udf_read_tagged(sb, block, block, &ident); if (!bh) { - printk(KERN_ERR "udf: Block %Lu of volume descriptor " - "sequence is corrupted or we could not read " - "it.\n", (unsigned long long)block); + udf_err(sb, + "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", + (unsigned long long)block); return 1; } @@ -1541,7 +1536,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, * in a suitable order */ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { - printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + udf_err(sb, "Primary Volume Descriptor not found!\n"); return 1; } if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) @@ -1728,7 +1723,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) - printk(KERN_WARNING "UDF-fs: Bad block size\n"); + udf_warn(sb, "Bad block size\n"); return 0; } sbi->s_last_block = uopt->lastblock; @@ -1737,7 +1732,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, nsr_off = udf_check_vsd(sb); if (!nsr_off) { if (!silent) - printk(KERN_WARNING "UDF-fs: No VRS found\n"); + udf_warn(sb, "No VRS found\n"); return 0; } if (nsr_off == -1) @@ -1753,7 +1748,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, sbi->s_anchor = uopt->anchor; if (!udf_find_anchor(sb, fileset)) { if (!silent) - printk(KERN_WARNING "UDF-fs: No anchor found\n"); + udf_warn(sb, "No anchor found\n"); return 0; } return 1; @@ -1974,15 +1969,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) - printk(KERN_NOTICE - "UDF-fs: Rescanning with blocksize " - "%d\n", UDF_DEFAULT_BLOCKSIZE); + pr_notice("Rescanning with blocksize %d\n", + UDF_DEFAULT_BLOCKSIZE); uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; ret = udf_load_vrs(sb, &uopt, silent, &fileset); } } if (!ret) { - printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); + udf_warn(sb, "No partition found (1)\n"); goto error_out; } @@ -1997,10 +1991,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(lvidiu->maxUDFWriteRev); */ if (minUDFReadRev > UDF_MAX_READ_VERSION) { - printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " - "(max is %x)\n", - le16_to_cpu(lvidiu->minUDFReadRev), - UDF_MAX_READ_VERSION); + udf_err(sb, "minUDFReadRev=%x (max is %x)\n", + le16_to_cpu(lvidiu->minUDFReadRev), + UDF_MAX_READ_VERSION); goto error_out; } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) sb->s_flags |= MS_RDONLY; @@ -2014,28 +2007,27 @@ static int 
udf_fill_super(struct super_block *sb, void *options, int silent) } if (!sbi->s_partitions) { - printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); + udf_warn(sb, "No partition found (2)\n"); goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & UDF_PART_FLAG_READ_ONLY) { - printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " - "forcing readonly mount\n"); + pr_notice("Partition marked readonly; forcing readonly mount\n"); sb->s_flags |= MS_RDONLY; } if (udf_find_fileset(sb, &fileset, &rootdir)) { - printk(KERN_WARNING "UDF-fs: No fileset found\n"); + udf_warn(sb, "No fileset found\n"); goto error_out; } if (!silent) { struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); - udf_info("UDF: Mounting volume '%s', " - "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", - sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, + le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } if (!(sb->s_flags & MS_RDONLY)) @@ -2046,8 +2038,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); if (!inode) { - printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " - "partition=%d\n", + udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); goto error_out; } @@ -2055,7 +2046,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Allocate a dentry for the root inode */ sb->s_root = d_alloc_root(inode); if (!sb->s_root) { - printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); + udf_err(sb, "Couldn't allocate root dentry\n"); iput(inode); goto error_out; } @@ -2095,7 +2086,7 @@ void _udf_err(struct super_block *sb, const char *function, va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_CRIT "UDF-fs error (device %s): %s: %s", + pr_crit("error (device %s): %s: %s", sb->s_id, function, error_buf); } @@ -2107,7 +2098,7 @@ void _udf_warn(struct super_block *sb, const char *function, va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s", + pr_warn("warning (device %s): %s: %s", sb->s_id, function, error_buf); } @@ -2200,11 +2191,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } else if (ident != TAG_IDENT_SBD) { brelse(bh); - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 8424308db4b4..4b98fee8e161 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode) lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) - printk(KERN_WARNING - "udf_truncate_tail_extent(): Too long " - "extent after EOF in inode %u: i_size: " - "%Ld lbcount: %Ld extent %u+%u\n", - (unsigned)inode->i_ino, - (long long)inode->i_size, - (long long)lbcount, - (unsigned)eloc.logicalBlockNum, - (unsigned)elen); + udf_warn(inode->i_sb, + "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld 
extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) - printk(KERN_ERR "udf_truncate_tail_extent(): " - "Extent after EOF in inode %u.\n", - (unsigned)inode->i_ino); + udf_err(inode->i_sb, + "Extent after EOF in inode %u\n", + (unsigned)inode->i_ino); break; } } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 85e15edc080f..f3d449867301 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -1,6 +1,8 @@ #ifndef __UDF_DECL_H #define __UDF_DECL_H +#define pr_fmt(fmt) "UDF-fs: " fmt + #include "ecma_167.h" #include "osta_udf.h" @@ -16,18 +18,11 @@ #define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 -#undef UDFFS_DEBUG - -#ifdef UDFFS_DEBUG -#define udf_debug(f, a...) \ -do { \ - printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ - __FILE__, __LINE__, __func__); \ - printk(f, ##a); \ -} while (0) -#else -#define udf_debug(f, a...) /**/ -#endif +__attribute__((format(printf, 3, 4))) +extern void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) __attribute__((format(printf, 3, 4))) extern void _udf_warn(struct super_block *sb, const char *function, @@ -35,14 +30,19 @@ extern void _udf_warn(struct super_block *sb, const char *function, #define udf_warn(sb, fmt, ...) \ _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) -__attribute__((format(printf, 3, 4))) -extern void _udf_err(struct super_block *sb, const char *function, - const char *fmt, ...); -#define udf_err(sb, fmt, ...) \ - _udf_err(sb, __func__, fmt, ##__VA_ARGS__) +#define udf_info(fmt, ...) \ + pr_info("INFO " fmt, ##__VA_ARGS__) -#define udf_info(f, a...) \ - printk(KERN_INFO "UDF-fs INFO " f, ##a); +#undef UDFFS_DEBUG + +#ifdef UDFFS_DEBUG +#define udf_debug(fmt, ...) \ + printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) +#else +#define udf_debug(fmt, ...) 
\ + no_printk(fmt, ##__VA_ARGS__) +#endif #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index b8c828c4d200..1f11483eba6a 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -34,9 +34,10 @@ * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm */ +#include "udfdecl.h" + #include #include -#include "udfdecl.h" #define EPOCH_YEAR 1970 diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index d03a90b6ad69..44b815e57f94 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } @@ -242,7 +242,7 @@ try_again: if (utf_cnt) { error_out: ocu[++u_len] = '?'; - printk(KERN_DEBUG "udf: bad UTF-8 character\n"); + printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n")); } ocu[length - 1] = (uint8_t)u_len + 1; @@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } -- cgit v1.2.3-58-ga151 From c2bff36c299b88ca5e05638dfa210d709e2a87ef Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:06 -0700 Subject: udf: Neaten logging output, use vsprintf extension %pV Use %pV and remove a static buffer to save some text space and fix possible issues when several processes call error reporting function in parallel. Also change error level from KERN_CRIT to KERN_ERR. Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/super.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 39e3f351c7a6..926228e7435a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -75,8 +75,6 @@ #define UDF_DEFAULT_BLOCKSIZE 2048 -static char error_buf[1024]; - /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); static void udf_put_super(struct super_block *); @@ -2077,29 +2075,37 @@ error_out: void _udf_err(struct super_block *sb, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; - if (!(sb->s_flags & MS_RDONLY)) { - /* mark sb error */ + /* mark sb error */ + if (!(sb->s_flags & MS_RDONLY)) sb->s_dirt = 1; - } + va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - pr_crit("error (device %s): %s: %s", - sb->s_id, function, error_buf); } void _udf_warn(struct super_block *sb, const char *function, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - pr_warn("warning (device %s): %s: %s", - sb->s_id, function, error_buf); } static void udf_put_super(struct super_block *sb) -- cgit v1.2.3-58-ga151 From a983f368f8986c1ecb64f2947fcf594343130215 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:07 -0700 Subject: udf: Neaten udf_debug uses Just whitespace and argument alignment. Introduce some checkpatch warnings that deserve to be ignored. Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/balloc.c | 14 +++++----- fs/udf/directory.c | 4 +-- fs/udf/inode.c | 3 +-- fs/udf/lowlevel.c | 2 +- fs/udf/partition.c | 8 +++--- fs/udf/super.c | 78 ++++++++++++++++++++++++------------------------------ 6 files changed, 50 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 95518a9f589e..987585bb0a1d 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb, int nr_groups = bitmap->s_nr_groups; if (block_group >= nr_groups) { - udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, - nr_groups); + udf_debug("block_group (%d) > nr_groups (%d)\n", + block_group, nr_groups); } if (bitmap->s_block_bitmap[block_group]) { @@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, - count, partmap->s_partition_len); + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, + partmap->s_partition_len); goto error_return; } @@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (udf_set_bit(bit + i, bh->b_data)) { udf_debug("bit %ld already set\n", bit + i); udf_debug("byte=%2x\n", - ((char *)bh->b_data)[(bit + i) >> 3]); + ((char *)bh->b_data)[(bit + i) >> 3]); } } udf_add_free_space(sb, sbi->s_partition, count); @@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, partmap->s_partition_len); goto error_return; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 4e0843897d07..3e44f575fb9c 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) int padlen; if ((!buffer) || (!offset)) { - udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, - offset); + udf_debug("invalidparms, buffer=%p, offset=%p\n", + buffer, offset); return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f67e7e268c49..329e7a108ab7 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1985,8 +1985,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: - udf_debug("alloc_type = %d unsupported\n", - iinfo->i_alloc_type); + udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); return -1; } diff --git a/fs/udf/lowlevel.c 
b/fs/udf/lowlevel.c index 43e24a3b8e10..6583fe9b0645 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb) if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", - (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); + ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ vol_desc_start = ms_info.addr.lba; } else { diff --git a/fs/udf/partition.c b/fs/udf/partition.c index c72edb2260e3..f3e472c67709 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; if (partition >= sbi->s_partitions) { - udf_debug("block=%d, partition=%d, offset=%d: " - "invalid partition\n", block, partition, offset); + udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n", + block, partition, offset); return 0xFFFFFFFF; } map = &sbi->s_partmaps[partition]; @@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, vdata = &map->s_type_specific.s_virtual; if (block > vdata->s_num_entries) { - udf_debug("Trying to access block beyond end of VAT " - "(%d max %d)\n", block, vdata->s_num_entries); + udf_debug("Trying to access block beyond end of VAT (%d max %d)\n", + block, vdata->s_num_entries); return 0xFFFFFFFF; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 926228e7435a..e58123ad75ba 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -639,20 +639,16 @@ static loff_t udf_check_vsd(struct super_block *sb) udf_debug("ISO9660 Boot Record found\n"); break; case 1: - udf_debug("ISO9660 Primary Volume Descriptor " - "found\n"); + udf_debug("ISO9660 Primary Volume Descriptor found\n"); break; case 2: - udf_debug("ISO9660 Supplementary Volume " - "Descriptor found\n"); + udf_debug("ISO9660 Supplementary Volume Descriptor found\n"); break; case 3: - udf_debug("ISO9660 Volume Partition Descriptor " - "found\n"); + udf_debug("ISO9660 Volume Partition Descriptor found\n"); break; case 255: - udf_debug("ISO9660 Volume Descriptor Set " - "Terminator found\n"); + udf_debug("ISO9660 Volume Descriptor Set Terminator found\n"); break; default: udf_debug("ISO9660 VRS (%u) found\n", @@ -803,8 +799,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) pvoldesc->recordingDateAndTime)) { #ifdef UDFFS_DEBUG struct timestamp *ts = &pvoldesc->recordingDateAndTime; - udf_debug("recording time %04u/%02u/%02u" - " %02u:%02u (%x)\n", + udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, ts->minute, le16_to_cpu(ts->typeAndTimezone)); #endif @@ -815,7 +810,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, outstr->u_len > 31 ? 
31 : outstr->u_len); udf_debug("volIdent[] = '%s'\n", - UDF_SB(sb)->s_volume_ident); + UDF_SB(sb)->s_volume_ident); } if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) @@ -847,7 +842,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_metadata_fe = udf_iget(sb, &addr); @@ -867,7 +862,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Mirror metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_mirror_fe = udf_iget(sb, &addr); @@ -896,7 +891,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Bitmap file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_bitmap_fe = udf_iget(sb, &addr); @@ -988,10 +983,9 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; - udf_debug("Partition (%d type %x) starts at physical %d, " - "block length %d\n", p_index, - map->s_partition_type, map->s_partition_root, - map->s_partition_len); + udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n", + p_index, map->s_partition_type, + map->s_partition_root, map->s_partition_len); if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) @@ -1008,12 +1002,12 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_uspace.s_table = udf_iget(sb, &loc); if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", - p_index, map->s_uspace.s_table->i_ino); + p_index, map->s_uspace.s_table->i_ino); } if (phd->unallocSpaceBitmap.extLength) { @@ -1026,8 +1020,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; - udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, - bitmap->s_extPosition); + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } if (phd->partitionIntegrityTable.extLength) @@ -1043,13 +1037,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_fspace.s_table = udf_iget(sb, &loc); if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", - p_index, map->s_fspace.s_table->i_ino); + p_index, map->s_fspace.s_table->i_ino); } if (phd->freedSpaceBitmap.extLength) { @@ -1062,8 +1056,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, - 
bitmap->s_extPosition); + udf_debug("freedSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } return 0; } @@ -1325,9 +1319,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct metadataPartitionMap *mdm = (struct metadataPartitionMap *) &(lvd->partitionMaps[offset]); - udf_debug("Parsing Logical vol part %d " - "type %d id=%s\n", i, type, - UDF_ID_METADATA); + udf_debug("Parsing Logical vol part %d type %d id=%s\n", + i, type, UDF_ID_METADATA); map->s_partition_type = UDF_METADATA_MAP25; map->s_partition_func = udf_get_pblock_meta25; @@ -1346,21 +1339,20 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, mdm->flags & 0x01; udf_debug("Metadata Ident suffix=0x%x\n", - (le16_to_cpu( - ((__le16 *) - mdm->partIdent.identSuffix)[0]))); + le16_to_cpu(*(__le16 *) + mdm->partIdent.identSuffix)); udf_debug("Metadata part num=%d\n", - le16_to_cpu(mdm->partitionNum)); + le16_to_cpu(mdm->partitionNum)); udf_debug("Metadata part alloc unit size=%d\n", - le32_to_cpu(mdm->allocUnitSize)); + le32_to_cpu(mdm->allocUnitSize)); udf_debug("Metadata file loc=%d\n", - le32_to_cpu(mdm->metadataFileLoc)); + le32_to_cpu(mdm->metadataFileLoc)); udf_debug("Mirror file loc=%d\n", - le32_to_cpu(mdm->metadataMirrorFileLoc)); + le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%d\n", - le32_to_cpu(mdm->metadataBitmapFileLoc)); + le32_to_cpu(mdm->metadataBitmapFileLoc)); udf_debug("Duplicate Flag: %d %d\n", - mdata->s_dup_md_flag, mdm->flags); + mdata->s_dup_md_flag, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); @@ -1370,16 +1362,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, map->s_partition_num = le16_to_cpu(upm2->partitionNum); } udf_debug("Partition (%d:%d) type %d on volume %d\n", - i, map->s_partition_num, type, - map->s_volumeseqnum); + i, map->s_partition_num, type, map->s_volumeseqnum); } if (fileset) { struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); - udf_debug("FileSet found in LogicalVolDesc at block=%d, " - "partition=%d\n", fileset->logicalBlockNum, + udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n", + fileset->logicalBlockNum, fileset->partitionReferenceNum); } if (lvd->integritySeqExt.extLength) @@ -1734,8 +1725,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, return 0; } if (nsr_off == -1) - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); + udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n"); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { -- cgit v1.2.3-58-ga151 From 0aaa618863c40e86b543debe002d6f65ff5d61d4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 10 Oct 2011 18:32:06 +0200 Subject: quota: Drop path reference on error exit from quotactl One error exit from quotactl forgot to do path_put(). Fix that. 
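The shape of the fix, as a minimal user-space model (path_put() and the error value here are stand-ins for the kernel interfaces, not the real ones):

#include <errno.h>
#include <stdio.h>

struct path { int pinned; };

static void path_put(struct path *p)	/* stand-in for the kernel helper */
{
	p->pinned = 0;
}

static int quotactl_model(int lookup_fails)
{
	struct path path = { .pinned = 1 };	/* reference taken earlier */
	int ret;

	if (lookup_fails) {
		ret = -ENODEV;
		goto out;	/* the old code returned here, leaking the path */
	}
	ret = 0;		/* do_quotactl() + drop_super() would run here */
out:
	path_put(&path);	/* single exit point drops the reference */
	return ret;
}

int main(void)
{
	printf("error path returns %d\n", quotactl_model(1));
	return 0;
}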
Reported-by: Valerie Aurora Signed-off-by: Jan Kara --- fs/quota/quota.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b34bdb25490c..a18ef461a5ba 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -363,12 +363,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, } sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); +out: if (pathp && !IS_ERR(pathp)) path_put(pathp); return ret; -- cgit v1.2.3-58-ga151 From 6360e21f943172bb71772ec150b96a9e787a535f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 19 Oct 2011 14:16:23 +0400 Subject: ext3: Allow quota file use root reservation Quota file is fs's metadata, so it is reasonable to permit use of the root reservation if necessary. This patch fixes the 265th xfstest failure. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a7fceaba87f1..a2038928f9a3 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1440,14 +1440,14 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && + !use_reservation && sbi->s_resuid != current_fsuid() && (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { return 0; } @@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } -- cgit v1.2.3-58-ga151 From 3080a74ea39eece6ac21aae768c48ab8b1f89ac1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 23 Oct 2011 19:28:32 +0900 Subject: udf: Skip mirror metadata FE loading when metadata FE is ok It is not necessary to load mirror metadata FE when metadata FE is OK. So try to read it only the first time udf_get_pblock_meta25() fails to map the block from metadata FE.
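The load-on-first-failure pattern, sketched in isolation (simplified stand-ins; the real code keys off mdata->s_mirror_loaded_flag, as the diff below shows):

#include <stdio.h>

struct meta_data {
	int primary_ok;		/* stands in for s_metadata_fe being usable */
	int mirror_loaded;	/* stands in for s_mirror_loaded_flag */
};

static void load_mirror(struct meta_data *m)
{
	printf("loading mirror FE\n");	/* expensive read, done at most once */
	m->mirror_loaded = 1;
}

static int map_block(struct meta_data *m, int block)
{
	if (m->primary_ok)
		return block;		/* common case: primary FE maps it */
	if (!m->mirror_loaded)		/* first failure only */
		load_mirror(m);
	return block;			/* retry the lookup via the mirror */
}

int main(void)
{
	struct meta_data m = { .primary_ok = 0, .mirror_loaded = 0 };

	map_block(&m, 10);	/* triggers the one-time mirror load */
	map_block(&m, 11);	/* mirror already cached, no reload */
	return 0;
}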
Signed-off-by: Ashish Sangwan Signed-off-by: Namjae Jeon Signed-off-by: Jan Kara --- fs/udf/partition.c | 8 ++++++- fs/udf/super.c | 68 ++++++++++++++++++++++++++---------------------------- fs/udf/udf_sb.h | 1 + fs/udf/udfdecl.h | 2 ++ 4 files changed, 43 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/udf/partition.c b/fs/udf/partition.c index f3e472c67709..b526f25c04c9 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -321,8 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, /* We shouldn't mount such media... */ BUG_ON(!inode); retblk = udf_try_read_meta(inode, block, partition, offset); - if (retblk == 0xFFFFFFFF) { + if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); + if (!mdata->s_mirror_loaded_flag) { + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_mirror_loaded_flag = 1; + } + inode = mdata->s_mirror_fe; if (!inode) return 0xFFFFFFFF; diff --git a/fs/udf/super.c b/fs/udf/super.c index e58123ad75ba..dfe043a36593 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -826,59 +826,57 @@ out1: return ret; } +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num) +{ + struct kernel_lb_addr addr; + struct inode *metadata_fe; + + addr.logicalBlockNum = meta_file_loc; + addr.partitionReferenceNum = partition_num; + + metadata_fe = udf_iget(sb, &addr); + + if (metadata_fe == NULL) + udf_warn(sb, "metadata inode efe not found\n"); + else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); + iput(metadata_fe); + metadata_fe = NULL; + } + + return metadata_fe; +} + static int udf_load_metadata_files(struct super_block *sb, int partition) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; - int fe_error = 0; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; /* metadata address */ - addr.logicalBlockNum = mdata->s_meta_file_loc; - addr.partitionReferenceNum = map->s_partition_num; - udf_debug("Metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_iget(sb, &addr); + mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, + mdata->s_meta_file_loc, map->s_partition_num); if (mdata->s_metadata_fe == NULL) { - udf_warn(sb, "metadata inode efe not found, will try mirror inode\n"); - fe_error = 1; - } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); - fe_error = 1; - iput(mdata->s_metadata_fe); - mdata->s_metadata_fe = NULL; - } - - /* mirror file entry */ - addr.logicalBlockNum = mdata->s_mirror_file_loc; - addr.partitionReferenceNum = map->s_partition_num; + /* mirror file entry */ + udf_debug("Mirror metadata file location: block = %d part = %d\n", + mdata->s_mirror_file_loc, map->s_partition_num); - udf_debug("Mirror metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_iget(sb, &addr); - - if (mdata->s_mirror_fe == NULL) { - if (fe_error) { - 
udf_err(sb, "mirror inode efe not found and metadata inode is missing too, exiting...\n"); - goto error_exit; - } else - udf_warn(sb, "mirror inode efe not found, but metadata inode is OK\n"); - } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warn(sb, "mirror inode efe does not have short allocation descriptors!\n"); - iput(mdata->s_mirror_fe); - mdata->s_mirror_fe = NULL; - if (fe_error) + if (mdata->s_mirror_fe == NULL) { + udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); goto error_exit; + } } /* diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4858c191242b..a3146b05feeb 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -61,6 +61,7 @@ struct udf_meta_data { __u32 s_alloc_unit_size; __u16 s_align_unit_size; __u8 s_dup_md_flag; + __u8 s_mirror_loaded_flag; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f3d449867301..79aae3fe7b55 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -135,6 +135,8 @@ static inline void udf_updated_lvid(struct super_block *sb) UDF_SB(sb)->s_lvid_dirty = 1; } extern u64 lvid_get_unique_id(struct super_block *sb); +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num); /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, -- cgit v1.2.3-58-ga151 From ed47a7d00c22b326fc4c97342a73ecd15929732e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Oct 2011 16:47:48 +0200 Subject: udf: Cleanup metadata flags handling Use simple ->s_flags variable instead of u8 variable for each flag. Signed-off-by: Jan Kara --- fs/udf/partition.c | 4 ++-- fs/udf/super.c | 8 ++++---- fs/udf/udf_sb.h | 6 ++++-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/udf/partition.c b/fs/udf/partition.c index b526f25c04c9..d6caf01a2097 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -323,10 +323,10 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, retblk = udf_try_read_meta(inode, block, partition, offset); if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); - if (!mdata->s_mirror_loaded_flag) { + if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) { mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_loaded_flag = 1; + mdata->s_flags |= MF_MIRROR_FE_LOADED; } inode = mdata->s_mirror_fe; diff --git a/fs/udf/super.c b/fs/udf/super.c index dfe043a36593..e185253470df 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1333,8 +1333,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, le32_to_cpu(mdm->allocUnitSize); mdata->s_align_unit_size = le16_to_cpu(mdm->alignUnitSize); - mdata->s_dup_md_flag = - mdm->flags & 0x01; + if (mdm->flags & 0x01) + mdata->s_flags |= MF_DUPLICATE_MD; udf_debug("Metadata Ident suffix=0x%x\n", le16_to_cpu(*(__le16 *) @@ -1349,8 +1349,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%d\n", le32_to_cpu(mdm->metadataBitmapFileLoc)); - udf_debug("Duplicate Flag: %d %d\n", - mdata->s_dup_md_flag, mdm->flags); + udf_debug("Flags: %d %d\n", + mdata->s_flags, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 
a3146b05feeb..5142a82e3276 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -54,14 +54,16 @@ #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ +#define MF_DUPLICATE_MD 0x01 +#define MF_MIRROR_FE_LOADED 0x02 + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; __u32 s_bitmap_file_loc; __u32 s_alloc_unit_size; __u16 s_align_unit_size; - __u8 s_dup_md_flag; - __u8 s_mirror_loaded_flag; + int s_flags; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; -- cgit v1.2.3-58-ga151 From ff3fc1736f1967b59801ab2cf6409fc6c8556b0a Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 31 Oct 2011 18:55:50 -0400 Subject: ext4: fix a typo in struct ext4_allocation_context This patch changes "bext" to "best". Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dc99930d4cb5..47705f3285e3 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -175,7 +175,7 @@ struct ext4_allocation_context { /* the best found extent */ struct ext4_free_extent ac_b_ex; - /* copy of the bext found extent taken before preallocation efforts */ + /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; /* number of iterations done. we have to track to limit searching */ -- cgit v1.2.3-58-ga151 From afeacc8c1f38b7bb93d4bc7b4ba04c2605061ef0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 16:00:52 -0400 Subject: fs: add export.h to files using EXPORT_SYMBOL/THIS_MODULE macros These files were getting via an implicit include path, but we want to crush those out of existence since they cost time during compiles of processing thousands of lines of headers for no reason. Give them the lightweight header that just contains the EXPORT_SYMBOL infrastructure. 
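After the split, a built-in file that only exports symbols needs nothing heavier than this (illustrative fragment; my_helper is a hypothetical function, not one touched by this series):

#include <linux/export.h>

int my_helper(void)
{
	return 0;
}
EXPORT_SYMBOL(my_helper);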
Signed-off-by: Paul Gortmaker --- fs/bio-integrity.c | 1 + fs/gfs2/ops_fstype.c | 1 + fs/ioprio.c | 1 + fs/jfs/jfs_logmgr.c | 1 + fs/nfs/pagelist.c | 1 + fs/nfs/pnfs_dev.c | 1 + fs/nfs/write.c | 1 + fs/nfsd/nfs4acl.c | 1 + fs/ocfs2/cluster/tcp.c | 1 + fs/ocfs2/dlm/dlmdebug.c | 1 + fs/proc/vmcore.c | 1 + 11 files changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9c5e6b2cd11a..c2183f3917cd 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7e823bbd2453..cb23c2be731a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a06508e5..f79dab83e17b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -21,6 +21,7 @@ */ #include #include +#include #include #include #include diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 583636f745e5..cc5f811ed383 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -67,6 +67,7 @@ #include /* for sync_blockdev() */ #include #include +#include #include #include #include diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1f..a788d8522c88 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" #include "pnfs.h" diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6fda5228ef56..4f359d2a26eb 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -28,6 +28,7 @@ * such damages. */ +#include #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b2..cd1edfd8c2d0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -20,6 +20,7 @@ #include #include #include +#include #include diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index ad88f1c0a4c3..9c51aff02ae2 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,6 +36,7 @@ #include #include +#include #include "acl.h" diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index db5ee4b4f47a..ad7d0c155de4 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e3..0e28e242226d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cd99bf557650..b0f450a2bb7c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-58-ga151 From 143cb494cb6662e37c4020b7fe9839837f718e56 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 Jul 2011 14:23:34 -0400 Subject: fs: add module.h to files that were implicitly using it Some files were using the complete module.h infrastructure without actually including the header at all. 
Fix them up in advance so once the implicit presence is removed, we won't get failures like this: CC [M] fs/nfsd/nfssvc.o fs/nfsd/nfssvc.c: In function 'nfsd_create_serv': fs/nfsd/nfssvc.c:335: error: 'THIS_MODULE' undeclared (first use in this function) fs/nfsd/nfssvc.c:335: error: (Each undeclared identifier is reported only once fs/nfsd/nfssvc.c:335: error: for each function it appears in.) fs/nfsd/nfssvc.c: In function 'nfsd': fs/nfsd/nfssvc.c:555: error: implicit declaration of function 'module_put_and_exit' make[3]: *** [fs/nfsd/nfssvc.o] Error 1 Signed-off-by: Paul Gortmaker --- fs/cifs/connect.c | 1 + fs/exofs/ore.c | 1 + fs/exofs/super.c | 1 + fs/fuse/cuse.c | 1 + fs/logfs/super.c | 1 + fs/nfs/nfs4filelayout.c | 1 + fs/nfs/pnfs.c | 1 + fs/nfsd/nfsctl.c | 1 + fs/nfsd/nfssvc.c | 1 + 9 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d545a95c30ed..7ef4e2846658 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "cifspdu.h" #include "cifsglob.h" diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fcfa86ae6faf..d271ad837202 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -23,6 +23,7 @@ */ #include +#include #include #include diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 057b237b8b69..e6085ec192d6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cca47f7b07..3426521f3205 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "fuse_i.h" diff --git a/fs/logfs/super.c b/fs/logfs/super.c index ce03a182c771..b9b3154b0485 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402f..955699515e70 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -31,6 +31,7 @@ #include #include +#include #include "internal.h" #include "nfs4filelayout.h" diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f700..ba1d5388fafd 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -29,6 +29,7 @@ #include #include +#include #include "internal.h" #include "pnfs.h" #include "iostat.h" diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index db34a585e112..c45a2ea4a090 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "idmap.h" #include "nfsd.h" diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b1..4b8e828ae15f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -8,6 +8,7 @@ #include #include +#include #include #include -- cgit v1.2.3-58-ga151 From fc360bd9cdcf875639a77f07fafec26699c546f3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 31 Oct 2011 17:06:32 -0700 Subject: /proc/self/numa_maps: restore "huge" tag for hugetlb vmas The display of the "huge" tag was accidentally removed in 29ea2f698 ("mm: use walk_page_range() instead of custom page table walking code"). 
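A quick user-space check for the restored tag (a sketch: it assumes the process has a hugetlb mapping, e.g. from mmap() with MAP_HUGETLB, and simply greps its own numa_maps for the string the kernel emits):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/numa_maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, " huge"))	/* tag printed by show_numa_map() */
			fputs(line, stdout);
	fclose(f);
	return 0;
}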
Reported-by: Stephen Hemminger Tested-by: Stephen Hemminger Reviewed-by: Stephen Wilson Cc: KOSAKI Motohiro Cc: Hugh Dickins Acked-by: David Rientjes Cc: Lee Schermerhorn Cc: Alexey Dobriyan Cc: Christoph Lameter Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5afaa58a8630..c7d4ee663f14 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1039,6 +1039,9 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " stack"); } + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + walk_page_range(vma->vm_start, vma->vm_end, &walk); if (!md->pages) -- cgit v1.2.3-58-ga151 From fcf634098c00dd9cd247447368495f0b79be12d1 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Mon, 31 Oct 2011 17:06:39 -0700 Subject: Cross Memory Attach The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - It does not allow for specifying iovecs for both src and dest; assuming preadv or pwritev were implemented, either the area read from or written to would need to be contiguous. - Currently mem_read allows only processes that are currently ptrace'ing the target, and are still able to ptrace the target, to read from the target. This check could possibly be moved to the open call, but it's not clear exactly what race this restriction is stopping (the reason appears to have been lost). - Having to send the fd of /proc/self/mem via SCM_RIGHTS on a unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other. - It doesn't allow for some future uses of the interface we would like to consider adding (see below). - Interestingly, reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily.) As mentioned previously, use of vmsplice instead was considered, but it has problems. Since you need the reader and writer working co-operatively, if the pipe is not drained then you block, which requires some wrapping to do non-blocking I/O on the send side or polling on the receive side. In all-to-all communication it requires ordering, otherwise you can deadlock. And in the example of many MPI tasks writing to one MPI task, vmsplice serialises the copying. There are some cases of MPI collectives where even a single-copy interface does not get us all the performance gain we could have. For example, in an MPI_Reduce, rather than copy the data from the source we would like to instead use it directly in a math op (say the reduce is doing a sum), as this would save us doing a copy. We don't need to keep a copy of the data from the source.
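As a concrete illustration of the basic single-copy read path, here is a hedged userspace sketch. It assumes a libc wrapper with the same signature as the syscall added by this patch (the helper name read_remote is made up), and error handling is deliberately minimal:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>    /* assumed home of the process_vm_readv() wrapper */

/* Copy 'len' bytes from address 'raddr' in process 'pid' into the local
 * buffer 'buf' with a single copy, without ptrace or shared memory. */
static ssize_t read_remote(pid_t pid, void *raddr, void *buf, size_t len)
{
        struct iovec local  = { .iov_base = buf,   .iov_len = len };
        struct iovec remote = { .iov_base = raddr, .iov_len = len };

        /* flags must currently be 0; a short count is possible if only
         * part of the remote range was readable. */
        return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}

Beyond plain copying, the currently unused flags argument leaves room for richer semantics, such as the reduce-style operation described above.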
I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - eg could specify the math operation and type and the kernel rather than just copying the data would apply the specified operation between the source and destination and store it in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra process messaging which is not MPI). This interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so in addition to this being useful for OpenMPI it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc, other architecture should mainly (I think) just need to add syscall numbers for the process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Howells Cc: James Morris Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/systbl.h | 2 + arch/powerpc/include/asm/unistd.h | 4 +- arch/x86/ia32/ia32entry.S | 2 + arch/x86/include/asm/unistd_32.h | 4 +- arch/x86/include/asm/unistd_64.h | 4 + arch/x86/kernel/syscall_table_32.S | 2 + fs/aio.c | 4 +- fs/compat.c | 7 +- fs/read_write.c | 8 +- include/linux/compat.h | 3 +- include/linux/fs.h | 7 +- include/linux/syscalls.h | 13 + kernel/sys_ni.c | 4 + mm/Makefile | 3 +- mm/process_vm_access.c | 496 +++++++++++++++++++++++++++++++++++++ security/keys/compat.c | 2 +- security/keys/keyctl.c | 2 +- 17 files changed, 550 insertions(+), 17 deletions(-) create mode 100644 mm/process_vm_access.c (limited to 'fs') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index fa0d27a400de..559ae1ee6706 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -354,3 +354,5 @@ COMPAT_SYS_SPU(clock_adjtime) SYSCALL_SPU(syncfs) COMPAT_SYS_SPU(sendmmsg) SYSCALL_SPU(setns) +COMPAT_SYS(process_vm_readv) +COMPAT_SYS(process_vm_writev) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b8b3f599362b..d3d1b5efd7eb 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -373,10 +373,12 @@ #define __NR_syncfs 348 #define __NR_sendmmsg 349 #define __NR_setns 350 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 #ifdef __KERNEL__ -#define __NR_syscalls 351 +#define __NR_syscalls 353 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 54edb207ff3a..a6253ec1b284 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -850,4 +850,6 @@ ia32_sys_call_table: .quad sys_syncfs .quad compat_sys_sendmmsg /* 345 */ .quad sys_setns + 
.quad compat_sys_process_vm_readv + .quad compat_sys_process_vm_writev ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 593485b38ab3..599c77d38f33 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -352,10 +352,12 @@ #define __NR_syncfs 344 #define __NR_sendmmsg 345 #define __NR_setns 346 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 #ifdef __KERNEL__ -#define NR_syscalls 347 +#define NR_syscalls 349 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 0a6ba337a2eb..0431f193c3f2 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg) __SYSCALL(__NR_setns, sys_setns) #define __NR_getcpu 309 __SYSCALL(__NR_getcpu, sys_getcpu) +#define __NR_process_vm_readv 310 +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) +#define __NR_process_vm_writev 311 +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index bc19be332bc9..9a0e31293920 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -346,3 +346,5 @@ ENTRY(sys_call_table) .long sys_syncfs .long sys_sendmmsg /* 345 */ .long sys_setns + .long sys_process_vm_readv + .long sys_process_vm_writev diff --git a/fs/aio.c b/fs/aio.c index e29ec485af25..632b235f4fbe 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1387,13 +1387,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); if (ret < 0) goto out; diff --git a/fs/compat.c b/fs/compat.c index 302e761bd0aa..c98787536bb8 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -546,7 +546,7 @@ out: ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, int check_access) { compat_ssize_t tot_len; struct iovec *iov = *ret_pointer = fast_pointer; @@ -593,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ goto out; - if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + if (check_access && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } @@ -1107,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, goto out; tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); + UIO_FASTIOV, iovstack, &iov, 1); if (tot_len == 0) { ret = 0; goto out; diff --git a/fs/read_write.c b/fs/read_write.c index dfd125798791..5ad4248b0cd8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -633,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, + int check_access) { unsigned long seg; ssize_t ret; @@ -689,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, ret = -EINVAL; goto out; } - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + if (check_access + && unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; goto out; } @@ -721,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_copy_check_uvector(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret <= 0) goto out; diff --git a/include/linux/compat.h b/include/linux/compat.h index c6e7523bf765..154bf5683015 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -547,7 +547,8 @@ extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer); + struct iovec **ret_pointer, + int check_access); extern void __user *compat_alloc_user_space(unsigned long len); diff --git a/include/linux/fs.h b/include/linux/fs.h index 14493a2d5a03..87b4c6b9692d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1633,9 +1633,10 @@ struct inode_operations { struct seq_file; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, - unsigned long nr_segs, unsigned long fast_segs, - struct iovec *fast_pointer, - struct iovec **ret_pointer); + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer, + int check_access); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1ff0ec2a5e8d..86a24b1166d1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -844,4 +844,17 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long sys_setns(int fd, int nstype); +asmlinkage long sys_process_vm_readv(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags); +asmlinkage long sys_process_vm_writev(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags); + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a9a5de07c4f1..47bfa16430d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -145,6 +145,10 @@ cond_syscall(sys_io_submit); 
cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); cond_syscall(sys_syslog); +cond_syscall(sys_process_vm_readv); +cond_syscall(sys_process_vm_writev); +cond_syscall(compat_sys_process_vm_readv); +cond_syscall(compat_sys_process_vm_writev); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/Makefile b/mm/Makefile index 836e4163c1bf..50ec00ef2a0e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,8 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + vmalloc.o pagewalk.o pgtable-generic.o \ + process_vm_access.o obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 000000000000..e920aa3ce104 --- /dev/null +++ b/mm/process_vm_access.c @@ -0,0 +1,496 @@ +/* + * linux/mm/process_vm_access.c + * + * Copyright (C) 2010-2011 Christopher Yeoh , IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_COMPAT +#include +#endif + +/** + * process_vm_rw_pages - read/write pages from task specified + * @task: task to read/write from + * @mm: mm for task + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @pa: address of page in task to start copying from/to + * @start_offset: offset in page to start copying from/to + * @len: number of bytes to copy + * @lvec: iovec array specifying where to copy to/from + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @vm_write: 0 means copy from, 1 means copy to + * @nr_pages_to_copy: number of pages to copy + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success, error code otherwise + */ +static int process_vm_rw_pages(struct task_struct *task, + struct mm_struct *mm, + struct page **process_pages, + unsigned long pa, + unsigned long start_offset, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + int vm_write, + unsigned int nr_pages_to_copy, + ssize_t *bytes_copied) +{ + int pages_pinned; + void *target_kaddr; + int pgs_copied = 0; + int j; + int ret; + ssize_t bytes_to_copy; + ssize_t rc = 0; + + *bytes_copied = 0; + + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages_pinned = get_user_pages(task, mm, pa, + nr_pages_to_copy, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); + + if (pages_pinned != nr_pages_to_copy) { + rc = -EFAULT; + goto end; + } + + /* Do the copy for each page */ + for (pgs_copied = 0; + (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); + pgs_copied++) { + /* Make sure we have a non zero length iovec */ + while (*lvec_current < lvec_cnt + && lvec[*lvec_current].iov_len == 0) + (*lvec_current)++; + if (*lvec_current == lvec_cnt) + break; + + /* + * Will copy smallest of: + * - bytes remaining in page + * - bytes remaining in destination iovec + */ + bytes_to_copy = min_t(ssize_t, PAGE_SIZE 
- start_offset, + len - *bytes_copied); + bytes_to_copy = min_t(ssize_t, bytes_to_copy, + lvec[*lvec_current].iov_len + - *lvec_offset); + + target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; + + if (vm_write) + ret = copy_from_user(target_kaddr, + lvec[*lvec_current].iov_base + + *lvec_offset, + bytes_to_copy); + else + ret = copy_to_user(lvec[*lvec_current].iov_base + + *lvec_offset, + target_kaddr, bytes_to_copy); + kunmap(process_pages[pgs_copied]); + if (ret) { + *bytes_copied += bytes_to_copy - ret; + pgs_copied++; + rc = -EFAULT; + goto end; + } + *bytes_copied += bytes_to_copy; + *lvec_offset += bytes_to_copy; + if (*lvec_offset == lvec[*lvec_current].iov_len) { + /* + * Need to copy remaining part of page into the + * next iovec if there are any bytes left in page + */ + (*lvec_current)++; + *lvec_offset = 0; + start_offset = (start_offset + bytes_to_copy) + % PAGE_SIZE; + if (start_offset) + pgs_copied--; + } else { + start_offset = 0; + } + } + +end: + if (vm_write) { + for (j = 0; j < pages_pinned; j++) { + if (j < pgs_copied) + set_page_dirty_lock(process_pages[j]); + put_page(process_pages[j]); + } + } else { + for (j = 0; j < pages_pinned; j++) + put_page(process_pages[j]); + } + + return rc; +} + +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ +#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) + +/** + * process_vm_rw_single_vec - read/write pages from task specified + * @addr: start memory address of target process + * @len: size of area to copy to/from + * @lvec: iovec array specifying where to copy to/from locally + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @mm: mm for task + * @task: task to read/write from + * @vm_write: 0 means copy from, 1 means copy to + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success or on failure error code + */ +static int process_vm_rw_single_vec(unsigned long addr, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + struct page **process_pages, + struct mm_struct *mm, + struct task_struct *task, + int vm_write, + ssize_t *bytes_copied) +{ + unsigned long pa = addr & PAGE_MASK; + unsigned long start_offset = addr - pa; + unsigned long nr_pages; + ssize_t bytes_copied_loop; + ssize_t rc = 0; + unsigned long nr_pages_copied = 0; + unsigned long nr_pages_to_copy; + unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES + / sizeof(struct pages *); + + *bytes_copied = 0; + + /* Work out address and page range required */ + if (len == 0) + return 0; + nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + + while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { + nr_pages_to_copy = min(nr_pages - nr_pages_copied, + max_pages_per_loop); + + rc = process_vm_rw_pages(task, mm, process_pages, pa, + start_offset, len, + lvec, lvec_cnt, + lvec_current, lvec_offset, + vm_write, nr_pages_to_copy, + &bytes_copied_loop); + start_offset = 0; + *bytes_copied += bytes_copied_loop; + + if (rc < 0) { + return rc; + } else { + len -= bytes_copied_loop; + nr_pages_copied += nr_pages_to_copy; + pa += nr_pages_to_copy * PAGE_SIZE; + } + } + + return rc; +} + +/* Maximum number of entries for process pages array + which lives on stack */ +#define 
PVM_MAX_PP_ARRAY_COUNT 16 + +/** + * process_vm_rw_core - core of reading/writing pages from task specified + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct task_struct *task; + struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; + struct page **process_pages = pp_stack; + struct mm_struct *mm; + unsigned long i; + ssize_t rc = 0; + ssize_t bytes_copied_loop; + ssize_t bytes_copied = 0; + unsigned long nr_pages = 0; + unsigned long nr_pages_iov; + unsigned long iov_l_curr_idx = 0; + size_t iov_l_curr_offset = 0; + ssize_t iov_len; + + /* + * Work out how many pages of struct pages we're going to need + * when eventually calling get_user_pages + */ + for (i = 0; i < riovcnt; i++) { + iov_len = rvec[i].iov_len; + if (iov_len > 0) { + nr_pages_iov = ((unsigned long)rvec[i].iov_base + + iov_len) + / PAGE_SIZE - (unsigned long)rvec[i].iov_base + / PAGE_SIZE + 1; + nr_pages = max(nr_pages, nr_pages_iov); + } + } + + if (nr_pages == 0) + return 0; + + if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { + /* For reliability don't try to kmalloc more than + 2 pages worth */ + process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, + sizeof(struct pages *)*nr_pages), + GFP_KERNEL); + + if (!process_pages) + return -ENOMEM; + } + + /* Get process information */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) { + rc = -ESRCH; + goto free_proc_pages; + } + + task_lock(task); + if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + task_unlock(task); + rc = -EPERM; + goto put_task_struct; + } + mm = task->mm; + + if (!mm || (task->flags & PF_KTHREAD)) { + task_unlock(task); + rc = -EINVAL; + goto put_task_struct; + } + + atomic_inc(&mm->mm_users); + task_unlock(task); + + for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + rc = process_vm_rw_single_vec( + (unsigned long)rvec[i].iov_base, rvec[i].iov_len, + lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, + process_pages, mm, task, vm_write, &bytes_copied_loop); + bytes_copied += bytes_copied_loop; + if (rc != 0) { + /* If we have managed to copy any data at all then + we return the number of bytes copied. 
Otherwise + we return the error code */ + if (bytes_copied) + rc = bytes_copied; + goto put_mm; + } + } + + rc = bytes_copied; +put_mm: + mmput(mm); + +put_task_struct: + put_task_struct(task); + +free_proc_pages: + if (process_pages != pp_stack) + kfree(process_pages); + return rc; +} + +/** + * process_vm_rw - check iovecs before calling core routine + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc; + + if (flags != 0) + return -EINVAL; + + /* Check iovecs */ + if (vm_write) + rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + else + rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + + rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, + iovstack_r, &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + + return rc; +} + +SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); +} + +SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, + const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); +} + +#ifdef CONFIG_COMPAT + +asmlinkage ssize_t +compat_process_vm_rw(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc = -EFAULT; + + if (flags != 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) + goto out; + + if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) + goto out; + + if (vm_write) + rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + else + rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, + UIO_FASTIOV, iovstack_r, + &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + 
kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + +out: + return rc; +} + +asmlinkage ssize_t +compat_sys_process_vm_readv(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 0); +} + +asmlinkage ssize_t +compat_sys_process_vm_writev(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 1); +} + +#endif diff --git a/security/keys/compat.c b/security/keys/compat.c index 338b510e9027..4c48e13448f8 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -38,7 +38,7 @@ long compat_keyctl_instantiate_key_iov( ret = compat_rw_copy_check_uvector(WRITE, _payload_iov, ioc, ARRAY_SIZE(iovstack), - iovstack, &iov); + iovstack, &iov, 1); if (ret < 0) return ret; if (ret == 0) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index eca51918c951..0b3f5d72af1c 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1065,7 +1065,7 @@ long keyctl_instantiate_key_iov(key_serial_t id, goto no_payload; ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret < 0) return ret; if (ret == 0) -- cgit v1.2.3-58-ga151 From c9f01245b6a7d77d17deaa71af10f6aca14fa24e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 31 Oct 2011 17:07:15 -0700 Subject: oom: remove oom_disable_count This removes mm->oom_disable_count entirely since it's unnecessary and currently buggy. The counter was intended to be per-process but it's currently decremented in the exit path for each thread that exits, causing it to underflow. The count was originally intended to prevent oom killing threads that share memory with threads that cannot be killed since it doesn't lead to future memory freeing. The counter could be fixed to represent all threads sharing the same mm, but it's better to remove the count since: - it is possible that the OOM_DISABLE thread sharing memory with the victim is waiting on that thread to exit and will actually cause future memory freeing, and - there is no guarantee that a thread is disabled from oom killing just because another thread sharing its mm is oom disabled. 
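The check that replaces the counter is small enough to restate as a standalone predicate. A hedged sketch (the helper name is made up) of the test this patch adds to the sibling-kill pass in oom_kill_task():

#include <linux/oom.h>
#include <linux/sched.h>

/* A task sharing the victim's mm must be skipped by the "kill processes
 * sharing the same memory" pass when its oom_score_adj is pinned to the
 * minimum, i.e. when it is OOM-disabled. */
static inline bool oom_sibling_unkillable(struct task_struct *q)
{
        return q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN;
}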
Signed-off-by: David Rientjes Reported-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Cc: Ying Han Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ---- fs/proc/base.c | 13 ------------- include/linux/mm_types.h | 3 --- kernel/exit.c | 2 -- kernel/fork.c | 10 +--------- mm/oom_kill.c | 23 +++++------------------ 6 files changed, 6 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 25dcbe5fc356..36254645b7cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); - if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&old_mm->oom_disable_count); - atomic_inc(&tsk->mm->oom_disable_count); - } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 5eb02069e1b8..8f0087e20e16 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_adjust != task->signal->oom_adj) { - if (oom_adjust == OOM_DISABLE) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_adj == OOM_DISABLE) - atomic_dec(&task->mm->oom_disable_count); - } - /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_score_adj != task->signal->oom_score_adj) { - if (oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&task->mm->oom_disable_count); - } task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c93d00a6e95d..6456624aa964 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -336,9 +336,6 @@ struct mm_struct { unsigned int token_priority; unsigned int last_interval; - /* How many tasks sharing this mm are OOM_DISABLE */ - atomic_t oom_disable_count; - unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d42..d0b7d988f873 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb272..70d76191afb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -501,7 +501,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); - atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -816,8 +815,6 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1391,13 +1388,8 @@ bad_fork_cleanup_io: 
bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) { - task_lock(p); - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&p->mm->oom_disable_count); - task_unlock(p); + if (p->mm) mmput(p->mm); - } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0d8943bc9fd..2b97e8f04607 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -54,13 +54,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; - if (new_val != old_val) { - if (new_val == OOM_SCORE_ADJ_MIN) - atomic_inc(¤t->mm->oom_disable_count); - else if (old_val == OOM_SCORE_ADJ_MIN) - atomic_dec(¤t->mm->oom_disable_count); - current->signal->oom_score_adj = new_val; - } + current->signal->oom_score_adj = new_val; spin_unlock_irq(&sighand->siglock); return old_val; @@ -172,16 +166,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (!p) return 0; - /* - * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN - * so the entire heuristic doesn't need to be executed for something - * that cannot be killed. - */ - if (atomic_read(&p->mm->oom_disable_count)) { - task_unlock(p); - return 0; - } - /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. @@ -451,6 +435,9 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) for_each_process(q) if (q->mm == mm && !same_thread_group(q, p) && !(q->flags & PF_KTHREAD)) { + if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; + task_lock(q); /* Protect ->comm from prctl() */ pr_err("Kill process %d (%s) sharing same memory\n", task_pid_nr(q), q->comm); @@ -727,7 +714,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - current->mm && !atomic_read(¤t->mm->oom_disable_count)) { + current->mm) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to -- cgit v1.2.3-58-ga151 From f5fc870da2f8798edb5481cd2137a3b2d5bd1b19 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Mon, 31 Oct 2011 17:07:21 -0700 Subject: tmpfs: add "tmpfs" to the Kconfig prompt to make it obvious. Add the leading word "tmpfs" to the Kconfig string to make it blindingly obvious that this selection refers to tmpfs. Signed-off-by: Robert P. J. Day Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 9fe0b349f4cd..5f4c45d4aa10 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -109,7 +109,7 @@ source "fs/proc/Kconfig" source "fs/sysfs/Kconfig" config TMPFS - bool "Virtual memory file system support (former shm fs)" + bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. -- cgit v1.2.3-58-ga151 From bc3e53f682d93df677dbd5006a404722b3adfe18 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 31 Oct 2011 17:07:30 -0700 Subject: mm: distinguish between mlocked and pinned pages Some kernel components pin user space memory (infiniband and perf) (by increasing the page count) and account that memory as "mlocked". The difference between mlocking and pinning is: A. 
mlocked pages are marked with PG_mlocked and are exempt from swapping. Page migration may move them around though. They are kept on a special LRU list. B. Pinned pages cannot be moved because something needs to directly access physical memory. They may not be on any LRU list. I recently saw an mlocked process where mm->locked_vm became bigger than the virtual size of the process (!) because some memory was accounted for twice: once when the page was mlocked and once when the Infiniband layer increased the refcount because it needed to pin the RDMA memory. This patch introduces a separate counter for pinned pages and accounts them separately. Signed-off-by: Christoph Lameter Cc: Mike Marciniszyn Cc: Roland Dreier Cc: Sean Hefty Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/umem.c | 6 +++--- drivers/infiniband/hw/ipath/ipath_user_pages.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++-- fs/proc/task_mmu.c | 2 ++ include/linux/mm_types.h | 2 +- kernel/events/core.c | 6 +++--- 6 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index b645e558876f..9155f91d66bf 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -136,7 +136,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, down_write(&current->mm->mmap_sem); - locked = npages + current->mm->locked_vm; + locked = npages + current->mm->pinned_vm; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { @@ -206,7 +206,7 @@ out: __ib_umem_release(context->device, umem, 0); kfree(umem); } else - current->mm->locked_vm = locked; + current->mm->pinned_vm = locked; up_write(&current->mm->mmap_sem); if (vma_list) @@ -222,7 +222,7 @@ static void ib_umem_account(struct work_struct *work) struct ib_umem *umem = container_of(work, struct ib_umem, work); down_write(&umem->mm->mmap_sem); - umem->mm->locked_vm -= umem->diff; + umem->mm->pinned_vm -= umem->diff; up_write(&umem->mm->mmap_sem); mmput(umem->mm); kfree(umem); diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index cfed5399f074..dc66c4506916 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -79,7 +79,7 @@ static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->locked_vm += num_pages; + current->mm->pinned_vm += num_pages; ret = 0; goto bail; @@ -178,7 +178,7 @@ void ipath_release_user_pages(struct page **p, size_t num_pages) __ipath_release_user_pages(p, num_pages, 1); - current->mm->locked_vm -= num_pages; + current->mm->pinned_vm -= num_pages; up_write(&current->mm->mmap_sem); } @@ -195,7 +195,7 @@ static void user_pages_account(struct work_struct *_work) container_of(_work, struct ipath_user_pages_work, work); down_write(&work->mm->mmap_sem); - work->mm->locked_vm -= work->num_pages; + work->mm->pinned_vm -= work->num_pages; up_write(&work->mm->mmap_sem); mmput(work->mm); kfree(work); diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 7689e49c13c9..2bc1d2b96298 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -74,7 +74,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->locked_vm += 
num_pages; + current->mm->pinned_vm += num_pages; ret = 0; goto bail; @@ -151,7 +151,7 @@ void qib_release_user_pages(struct page **p, size_t num_pages) __qib_release_user_pages(p, num_pages, 1); if (current->mm) { - current->mm->locked_vm -= num_pages; + current->mm->pinned_vm -= num_pages; up_write(¤t->mm->mmap_sem); } } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c7d4ee663f14..e418c5abdb0e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" @@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6456624aa964..f3175830cc73 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -304,7 +304,7 @@ struct mm_struct { unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ - unsigned long total_vm, locked_vm, shared_vm, exec_vm; + unsigned long total_vm, locked_vm, pinned_vm, shared_vm, exec_vm; unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; diff --git a/kernel/events/core.c b/kernel/events/core.c index d1a1bee35228..12a0287e0358 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3544,7 +3544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; + vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); mutex_unlock(&event->mmap_mutex); @@ -3625,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; + locked = vma->vm_mm->pinned_vm + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -3651,7 +3651,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += event->mmap_locked; unlock: if (!ret) -- cgit v1.2.3-58-ga151 From 94054fa3fca1fd78db02cb3d68d5627120f0a1d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:45 -0700 Subject: xfs: warn if direct reclaim tries to writeback pages Direct reclaim should never writeback pages. For now, handle the situation and warn about it. Ultimately, this will be a BUG_ON. 
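The detection rests on two task flags: PF_MEMALLOC is set on any task allocating for reclaim, and kswapd additionally runs with PF_KSWAPD. A hedged restatement of the test used in the hunk below (the helper name is made up):

#include <linux/sched.h>

/* True only for direct reclaim: the task is reclaiming (PF_MEMALLOC set)
 * but is not the kswapd thread (PF_KSWAPD clear). */
static inline bool in_direct_reclaim(void)
{
        return (current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC;
}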
Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/xfs_aops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 11b2aad982d4..33b13310ee0c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -902,11 +902,11 @@ xfs_vm_writepage( * random callers for direct reclaim or memcg reclaim. We explicitly * allow reclaim from kswapd as the stack usage there is relatively low. * - * This should really be done by the core VM, but until that happens - * filesystems like XFS, btrfs and ext4 have to take care of this - * by themselves. + * This should never happen except in the case of a VM regression so + * warn about it. */ - if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) goto redirty; /* -- cgit v1.2.3-58-ga151 From 966dbde2c208e07bab7a45a7855e1e693eabe661 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:48 -0700 Subject: ext4: warn if direct reclaim tries to writeback pages Direct reclaim should never writeback pages. Warn if an attempt is made. Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986e2388f031..0defe0bfe019 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1811,8 +1811,12 @@ static int ext4_writepage(struct page *page, * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list + * We can also reach here via shrink_page_list but it + * should never be for direct reclaim so warn if that + * happens */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC); goto redirty_page; } if (commit_write) -- cgit v1.2.3-58-ga151 From 798248206b59acc6e1238c778281419c041891a7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 31 Oct 2011 17:08:07 -0700 Subject: lib/string.c: introduce memchr_inv() memchr_inv() is mainly used to check whether the whole buffer is filled with just a specified byte. The function name and prototype are stolen from logfs and the implementation is from SLUB. 
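A typical use, mirroring the SLUB poison checks converted below, is verifying that an area holds nothing but a single fill byte; a minimal hedged sketch:

#include <linux/poison.h>
#include <linux/string.h>
#include <linux/types.h>

/* memchr_inv() returns NULL when every byte of the area equals the given
 * value, otherwise a pointer to the first byte that differs. */
static bool area_fully_poisoned(const void *buf, size_t len)
{
        return memchr_inv(buf, POISON_INUSE, len) == NULL;
}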
Signed-off-by: Akinobu Mita Acked-by: Christoph Lameter Acked-by: Pekka Enberg Cc: Matt Mackall Acked-by: Joern Engel Cc: Marcin Slusarz Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/logfs.h | 1 - fs/logfs/super.c | 22 -------------------- include/linux/string.h | 1 + lib/string.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 47 ++----------------------------------------- 5 files changed, 57 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index f22d108bfa5d..398ecff6e548 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); void emergency_read_end(struct page *page); void logfs_crash_dump(struct super_block *sb); -void *memchr_inv(const void *s, int c, size_t n); int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); int logfs_check_ds(struct logfs_disk_super *ds); int logfs_write_sb(struct super_block *sb); diff --git a/fs/logfs/super.c b/fs/logfs/super.c index ce03a182c771..f2697e4df109 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -90,28 +90,6 @@ void logfs_crash_dump(struct super_block *sb) dump_segfile(sb); } -/* - * TODO: move to lib/string.c - */ -/** - * memchr_inv - Find a character in an area of memory. - * @s: The memory area - * @c: The byte to search for - * @n: The size of the area. - * - * returns the address of the first character other than @c, or %NULL - * if the whole buffer contains just @c. - */ -void *memchr_inv(const void *s, int c, size_t n) -{ - const unsigned char *p = s; - while (n-- != 0) - if ((unsigned char)c != *p++) - return (void *)(p - 1); - - return NULL; -} - /* * FIXME: There should be a reserve for root, similar to ext2. */ diff --git a/include/linux/string.h b/include/linux/string.h index a176db2f2c85..e033564f10ba 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -114,6 +114,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif +void *memchr_inv(const void *s, int c, size_t n); extern char *kstrdup(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); diff --git a/lib/string.c b/lib/string.c index 01fad9b203e1..11df54325fb8 100644 --- a/lib/string.c +++ b/lib/string.c @@ -756,3 +756,57 @@ void *memchr(const void *s, int c, size_t n) } EXPORT_SYMBOL(memchr); #endif + +static void *check_bytes8(const u8 *start, u8 value, unsigned int bytes) +{ + while (bytes) { + if (*start != value) + return (void *)start; + start++; + bytes--; + } + return NULL; +} + +/** + * memchr_inv - Find an unmatching character in an area of memory. + * @start: The memory area + * @c: Find a character other than c + * @bytes: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. 
+ */ +void *memchr_inv(const void *start, int c, size_t bytes) +{ + u8 value = c; + u64 value64; + unsigned int words, prefix; + + if (bytes <= 16) + return check_bytes8(start, value, bytes); + + value64 = value | value << 8 | value << 16 | value << 24; + value64 = (value64 & 0xffffffff) | value64 << 32; + prefix = 8 - ((unsigned long)start) % 8; + + if (prefix) { + u8 *r = check_bytes8(start, value, prefix); + if (r) + return r; + start += prefix; + bytes -= prefix; + } + + words = bytes / 8; + + while (words) { + if (*(u64 *)start != value64) + return check_bytes8(start, value, 8); + start += 8; + words--; + } + + return check_bytes8(start, value, bytes % 8); +} +EXPORT_SYMBOL(memchr_inv); diff --git a/mm/slub.c b/mm/slub.c index 95215aa6a75e..7d2a996c307e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -655,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) -{ - while (bytes) { - if (*start != value) - return start; - start++; - bytes--; - } - return NULL; -} - -static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) -{ - u64 value64; - unsigned int words, prefix; - - if (bytes <= 16) - return check_bytes8(start, value, bytes); - - value64 = value | value << 8 | value << 16 | value << 24; - value64 = (value64 & 0xffffffff) | value64 << 32; - prefix = 8 - ((unsigned long)start) % 8; - - if (prefix) { - u8 *r = check_bytes8(start, value, prefix); - if (r) - return r; - start += prefix; - bytes -= prefix; - } - - words = bytes / 8; - - while (words) { - if (*(u64 *)start != value64) - return check_bytes8(start, value, 8); - start += 8; - words--; - } - - return check_bytes8(start, value, bytes % 8); -} - static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -712,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; - fault = check_bytes(start, value, bytes); + fault = memchr_inv(start, value, bytes); if (!fault) return 1; @@ -805,7 +762,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; - fault = check_bytes(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) -- cgit v1.2.3-58-ga151 From 09f363c7363eb10cfb4b82094bd7064e5608258b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 31 Oct 2011 17:08:57 -0700 Subject: vmscan: fix shrinker callback bug in fs/super.c The callback must not return -1 when nr_to_scan is zero. Fix the bug in fs/super.c and add this requirement to the callback specification. Signed-off-by: Mikulas Patocka Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 2 +- include/linux/shrinker.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 3f56a269a4f4..32a81f3467e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return -1; + return !sc->nr_to_scan ? 
0 : -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 790651b4e5ba..a83833a1f7a2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,6 +20,7 @@ struct shrink_control { * 'nr_to_scan' entries and attempt to free them up. It should return * the number of objects which remain in the cache. If it returns -1, it means * it cannot do any scanning at this time (eg. there is a risk of deadlock). + * The callback must not return -1 if nr_to_scan is zero. * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. -- cgit v1.2.3-58-ga151 From 72a2ebd8bc62e6658513d3b2a1119e91c3ea6810 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 31 Oct 2011 17:09:00 -0700 Subject: fs/buffer.c: add device information for error output in __find_get_block_slow() On the ext4 mailing list[1], we got some reports about errors in __find_get_block_slow(), but the information is very limited. If the device information is given, we can know the name of the sick volume. Furthermore, we can get the corresponding status of that block (group, inode block, etc.) by analyzing the disk layout. [1] http://marc.info/?l=linux-ext4&m=131379831421147&w=2 Signed-off-by: Tao Ma Cc: Al Viro Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 936d6035f6e2..70a19745cb61 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { + char b[BDEVNAME_SIZE]; + printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + printk("device %s blocksize: %d\n", bdevname(bdev, b), + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); -- cgit v1.2.3-58-ga151 From d70ef97baf048412c395bb5d65791d8fe133a52b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 31 Oct 2011 17:10:04 -0700 Subject: fs/pipe.c: add ->statfs callback for pipefs Currently a statfs on a pipe's /proc//fd/ link returns -ENOSYS. Wire pipefs up so that the statfs succeeds. This is required by checkpoint-restart in userspace to make it possible to distinguish pipes from fifos. When we dump information about a task's open files we use the /proc/pid/fd directory's symlinks and the fact that opening any of them gives us exactly the same dentry->inode pair as the original process has. Now if a task we're dumping has opened a pipe and a fifo we need to detect this and act accordingly. Knowing that an fd with type S_ISFIFO resides on a pipefs is the most precise way.
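With ->statfs wired up, the classification becomes a single fstatfs() call. A hedged userspace sketch (the magic value is assumed to match PIPEFS_MAGIC in linux/magic.h, which simple_statfs() reports via f_type):

#include <sys/vfs.h>

#define PIPEFS_MAGIC 0x50495045 /* "PIPE"; assumed, see linux/magic.h */

/* For a descriptor already known to be S_ISFIFO: returns 1 for an
 * anonymous pipe (it lives on pipefs), 0 for an on-disk fifo, and -1
 * if fstatfs() failed (pre-patch kernels fail here with ENOSYS). */
static int fd_is_anonymous_pipe(int fd)
{
        struct statfs st;

        if (fstatfs(fd, &st) < 0)
                return -1;
        return st.f_type == PIPEFS_MAGIC;
}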
Signed-off-by: Pavel Emelyanov Reviewed-by: Tejun Heo Acked-by: Serge Hallyn Signed-off-by: Cyrill Gorcunov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 0e0be1dc0f8e..4065f07366b3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1254,6 +1254,7 @@ out: static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, + .statfs = simple_statfs, }; /* -- cgit v1.2.3-58-ga151 From b9075fa968a0a4347aef35e235e2995c0e57dddd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 31 Oct 2011 17:11:33 -0700 Subject: treewide: use __printf not __attribute__((format(printf,...))) Standardize the style for compiler based printf format verification. Standardized the location of __printf too. Done via script and a little typing. $ grep -rPl --include=*.[ch] -w "__attribute__" * | \ grep -vP "^(tools|scripts|include/linux/compiler-gcc.h)" | \ xargs perl -n -i -e 'local $/; while (<>) { s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.+)\s*,\s*(.+)\s*\)\s*\)\s*\)/__printf($1, $2)/g ; print; }' [akpm@linux-foundation.org: revert arch bits] Signed-off-by: Joe Perches Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/hisax/callc.c | 4 +- drivers/isdn/hisax/hisax.h | 4 +- drivers/isdn/hisax/isdnl1.h | 2 +- drivers/isdn/hisax/isdnl3.c | 2 +- drivers/isdn/hisax/st5481_d.c | 4 +- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 +- drivers/net/wireless/ath/ath.h | 5 +- drivers/net/wireless/ath/ath5k/debug.h | 4 +- drivers/net/wireless/ath/ath6kl/debug.h | 4 +- drivers/net/wireless/b43/b43.h | 12 +-- drivers/net/wireless/b43legacy/b43legacy.h | 16 ++-- drivers/staging/iio/trigger.h | 3 +- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ext2/ext2.h | 8 +- fs/ext4/ext4.h | 44 +++++------ fs/fat/fat.h | 9 +-- fs/gfs2/glock.h | 2 +- fs/hpfs/hpfs_fn.h | 4 +- fs/nilfs2/nilfs.h | 8 +- fs/ntfs/debug.h | 15 ++-- fs/ocfs2/super.h | 14 ++-- fs/partitions/ldm.c | 16 ++-- fs/udf/udfdecl.h | 4 +- fs/ufs/ufs.h | 9 ++- fs/xfs/xfs_message.h | 42 +++++----- include/asm-generic/bug.h | 11 +-- include/drm/drmP.h | 10 +-- include/linux/audit.h | 11 ++- include/linux/blktrace_api.h | 2 +- include/linux/device.h | 110 +++++++++++++-------------- include/linux/dynamic_debug.h | 19 +++-- include/linux/ext3_fs.h | 16 ++-- include/linux/fs.h | 4 +- include/linux/fscache-cache.h | 8 +- include/linux/gameport.h | 8 +- include/linux/kallsyms.h | 5 +- include/linux/kdb.h | 9 +-- include/linux/kernel.h | 44 +++++------ include/linux/kexec.h | 4 +- include/linux/kmod.h | 4 +- include/linux/kobject.h | 26 +++---- include/linux/kthread.h | 4 +- include/linux/libata.h | 18 ++--- include/linux/mmiotrace.h | 8 +- include/linux/netdevice.h | 34 ++++----- include/linux/printk.h | 12 +-- include/linux/quotaops.h | 2 +- include/linux/seq_file.h | 3 +- include/linux/trace_seq.h | 8 +- include/net/bluetooth/bluetooth.h | 2 +- include/net/netfilter/nf_log.h | 3 +- include/net/sock.h | 4 +- include/sound/core.h | 4 +- include/sound/info.h | 4 +- include/sound/seq_kernel.h | 4 +- include/xen/hvc-console.h | 4 +- include/xen/xenbus.h | 12 +-- net/nfc/nfc.h | 2 +- net/rds/rds.h | 8 +- net/sunrpc/svc.c | 5 +- sound/firewire/cmp.c | 2 +- 61 files changed, 324 insertions(+), 350 deletions(-) (limited to 'fs') diff --git a/drivers/isdn/hisax/callc.c b/drivers/isdn/hisax/callc.c index 37e685eafd24..c4897e1075d8 100644 --- a/drivers/isdn/hisax/callc.c +++ 
b/drivers/isdn/hisax/callc.c @@ -65,7 +65,7 @@ hisax_findcard(int driverid) return (struct IsdnCardState *) 0; } -static __attribute__((format(printf, 3, 4))) void +static __printf(3, 4) void link_debug(struct Channel *chanp, int direction, char *fmt, ...) { va_list args; @@ -1068,7 +1068,7 @@ init_d_st(struct Channel *chanp) return 0; } -static __attribute__((format(printf, 2, 3))) void +static __printf(2, 3) void callc_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h index 0a5c42a3f125..aff45a11a92d 100644 --- a/drivers/isdn/hisax/hisax.h +++ b/drivers/isdn/hisax/hisax.h @@ -1287,9 +1287,9 @@ int jiftime(char *s, long mark); int HiSax_command(isdn_ctrl * ic); int HiSax_writebuf_skb(int id, int chan, int ack, struct sk_buff *skb); -__attribute__((format(printf, 3, 4))) +__printf(3, 4) void HiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, ...); -__attribute__((format(printf, 3, 0))) +__printf(3, 0) void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, va_list args); void HiSax_reportcard(int cardnr, int sel); int QuickHex(char *txt, u_char * p, int cnt); diff --git a/drivers/isdn/hisax/isdnl1.h b/drivers/isdn/hisax/isdnl1.h index 425d86116f2b..66ddcab19bba 100644 --- a/drivers/isdn/hisax/isdnl1.h +++ b/drivers/isdn/hisax/isdnl1.h @@ -21,7 +21,7 @@ #define B_XMTBUFREADY 1 #define B_ACKPENDING 2 -__attribute__((format(printf, 2, 3))) +__printf(2, 3) void debugl1(struct IsdnCardState *cs, char *fmt, ...); void DChannel_proc_xmt(struct IsdnCardState *cs); void DChannel_proc_rcv(struct IsdnCardState *cs); diff --git a/drivers/isdn/hisax/isdnl3.c b/drivers/isdn/hisax/isdnl3.c index ad291f21b201..1c24e4457b6f 100644 --- a/drivers/isdn/hisax/isdnl3.c +++ b/drivers/isdn/hisax/isdnl3.c @@ -66,7 +66,7 @@ static char *strL3Event[] = "EV_TIMEOUT", }; -static __attribute__((format(printf, 2, 3))) void +static __printf(2, 3) void l3m_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; diff --git a/drivers/isdn/hisax/st5481_d.c b/drivers/isdn/hisax/st5481_d.c index 44082637a09f..db247b79e561 100644 --- a/drivers/isdn/hisax/st5481_d.c +++ b/drivers/isdn/hisax/st5481_d.c @@ -167,7 +167,7 @@ static struct FsmNode L1FnList[] __initdata = {ST_L1_F8, EV_IND_RSY, l1_ignore}, }; -static __attribute__((format(printf, 2, 3))) +static __printf(2, 3) void l1m_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; @@ -270,7 +270,7 @@ static char *strDoutEvent[] = "EV_DOUT_UNDERRUN", }; -static __attribute__((format(printf, 2, 3))) +static __printf(2, 3) void dout_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index fca66165110e..8fda331c65df 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -581,8 +581,9 @@ extern const struct ethtool_ops mlx4_en_ethtool_ops; * printk / logging functions */ +__printf(3, 4) int en_print(const char *level, const struct mlx4_en_priv *priv, - const char *format, ...) __attribute__ ((format (printf, 3, 4))); + const char *format, ...); #define en_dbg(mlevel, priv, format, arg...) 
\ do { \ diff --git a/drivers/net/wireless/ath/ath.h b/drivers/net/wireless/ath/ath.h index 908fdbc3e0ee..0f9ee46cfc97 100644 --- a/drivers/net/wireless/ath/ath.h +++ b/drivers/net/wireless/ath/ath.h @@ -173,8 +173,7 @@ bool ath_hw_keyreset(struct ath_common *common, u16 entry); void ath_hw_cycle_counters_update(struct ath_common *common); int32_t ath_hw_get_listen_time(struct ath_common *common); -extern __attribute__((format (printf, 2, 3))) -void ath_printk(const char *level, const char *fmt, ...); +extern __printf(2, 3) void ath_printk(const char *level, const char *fmt, ...); #define _ath_printk(level, common, fmt, ...) \ do { \ @@ -258,7 +257,7 @@ do { \ #else -static inline __attribute__((format (printf, 3, 4))) +static inline __attribute__ ((format (printf, 3, 4))) void ath_dbg(struct ath_common *common, enum ATH_DEBUG dbg_mask, const char *fmt, ...) { diff --git a/drivers/net/wireless/ath/ath5k/debug.h b/drivers/net/wireless/ath/ath5k/debug.h index 7f37df3125fd..0a3f916a1ef3 100644 --- a/drivers/net/wireless/ath/ath5k/debug.h +++ b/drivers/net/wireless/ath/ath5k/debug.h @@ -141,10 +141,10 @@ ath5k_debug_printtxbuf(struct ath5k_hw *ah, struct ath5k_buf *bf); #include -static inline void __attribute__ ((format (printf, 3, 4))) +static inline __printf(3, 4) void ATH5K_DBG(struct ath5k_hw *ah, unsigned int m, const char *fmt, ...) {} -static inline void __attribute__ ((format (printf, 3, 4))) +static inline __printf(3, 4) void ATH5K_DBG_UNLIMIT(struct ath5k_hw *ah, unsigned int m, const char *fmt, ...) {} diff --git a/drivers/net/wireless/ath/ath6kl/debug.h b/drivers/net/wireless/ath/ath6kl/debug.h index 9288a3ce1e39..7b7675f70a10 100644 --- a/drivers/net/wireless/ath/ath6kl/debug.h +++ b/drivers/net/wireless/ath/ath6kl/debug.h @@ -44,8 +44,8 @@ enum ATH6K_DEBUG_MASK { }; extern unsigned int debug_mask; -extern int ath6kl_printk(const char *level, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int ath6kl_printk(const char *level, const char *fmt, ...); #define ath6kl_info(fmt, ...) \ ath6kl_printk(KERN_INFO, fmt, ##__VA_ARGS__) diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h index 447a2307c9d9..37110dfd2c96 100644 --- a/drivers/net/wireless/b43/b43.h +++ b/drivers/net/wireless/b43/b43.h @@ -1011,14 +1011,10 @@ static inline bool b43_using_pio_transfers(struct b43_wldev *dev) } /* Message printing */ -void b43info(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -void b43err(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -void b43warn(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -void b43dbg(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); +__printf(2, 3) void b43info(struct b43_wl *wl, const char *fmt, ...); +__printf(2, 3) void b43err(struct b43_wl *wl, const char *fmt, ...); +__printf(2, 3) void b43warn(struct b43_wl *wl, const char *fmt, ...); +__printf(2, 3) void b43dbg(struct b43_wl *wl, const char *fmt, ...); /* A WARN_ON variant that vanishes when b43 debugging is disabled. 
diff --git a/drivers/net/wireless/b43legacy/b43legacy.h b/drivers/net/wireless/b43legacy/b43legacy.h index 12b518251581..1d4fc9db7f5e 100644 --- a/drivers/net/wireless/b43legacy/b43legacy.h +++ b/drivers/net/wireless/b43legacy/b43legacy.h @@ -810,15 +810,15 @@ struct b43legacy_lopair *b43legacy_get_lopair(struct b43legacy_phy *phy, /* Message printing */ -void b43legacyinfo(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); -void b43legacyerr(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); -void b43legacywarn(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); +__printf(2, 3) +void b43legacyinfo(struct b43legacy_wl *wl, const char *fmt, ...); +__printf(2, 3) +void b43legacyerr(struct b43legacy_wl *wl, const char *fmt, ...); +__printf(2, 3) +void b43legacywarn(struct b43legacy_wl *wl, const char *fmt, ...); #if B43legacy_DEBUG -void b43legacydbg(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); +__printf(2, 3) +void b43legacydbg(struct b43legacy_wl *wl, const char *fmt, ...); #else /* DEBUG */ # define b43legacydbg(wl, fmt...) do { /* nothing */ } while (0) #endif /* DEBUG */ diff --git a/drivers/staging/iio/trigger.h b/drivers/staging/iio/trigger.h index 598fcb3599f9..5cc42a655c88 100644 --- a/drivers/staging/iio/trigger.h +++ b/drivers/staging/iio/trigger.h @@ -115,8 +115,7 @@ void iio_trigger_poll_chained(struct iio_trigger *trig, s64 time); irqreturn_t iio_trigger_generic_data_rdy_poll(int irq, void *private); -struct iio_trigger *iio_allocate_trigger(const char *fmt, ...) - __attribute__((format(printf, 1, 2))); +__printf(1, 2) struct iio_trigger *iio_allocate_trigger(const char *fmt, ...); void iio_free_trigger(struct iio_trigger *trig); #endif /* _IIO_TRIGGER_H_ */ diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b36c5572b3f3..54481a3b2c79 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); -__attribute__ ((format(printf, 1, 2))) +__printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index af9fc89b1b2d..9a4e5e206d08 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ -extern void ext2_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext2_msg(struct super_block *, const char *, const char *, ...) 
- __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext2_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d7bd0f066e..cec3145e532c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1878,40 +1878,40 @@ extern int ext4_group_extend(struct super_block *sb, extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); -extern void __ext4_error(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_error(sb, message...) __ext4_error(sb, __func__, \ __LINE__, ## message) -extern void ext4_error_inode(struct inode *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); -extern void ext4_error_file(struct file *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); +extern __printf(5, 6) +void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern void __ext4_abort(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ __LINE__, ## message) -extern void __ext4_warning(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ __LINE__, ## message) -extern void ext4_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ __LINE__, msg) -extern void __ext4_grp_locked_error(const char *, unsigned int, \ - struct super_block *, ext4_group_t, \ - unsigned long, ext4_fsblk_t, \ - const char *, ...) - __attribute__ ((format (printf, 7, 8))); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); #define ext4_grp_locked_error(sb, grp, message...) 
\ __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a5d3853822e0..1510a4d51990 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void -__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +extern __printf(3, 4) __cold +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); #define fat_fs_error(sb, fmt, args...) \ __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +__printf(3, 4) __cold +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 66707118af25..2553b858a72e 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); -__attribute__ ((format(printf, 2, 3))) +__printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 331b5e234ef3..de946170ebb1 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) /* super.c */ -void hpfs_error(struct super_block *, const char *, ...) - __attribute__((format (printf, 2, 3))); +__printf(2, 3) +void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_count_one_bitmap(struct super_block *, secno); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 255d5e1c03b7..3777d138f895 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); -extern void nilfs_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 
- __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void nilfs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void nilfs_warning(struct super_block *, const char *, const char *, ...); extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 2142b1c68b61..53c27eaf2307 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -30,8 +30,9 @@ extern int debug_msgs; -extern void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...) __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); /** * ntfs_debug - write a debug level message to syslog * @f: a printf format string containing the message @@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #endif /* !DEBUG */ -extern void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) -extern void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 40c7de084c10..74ff74cf78fe 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq; int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) -void __ocfs2_abort(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_abort(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index af9fdf046769..bd8ae788f689 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -49,18 +49,20 @@ #define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) #define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) -__attribute__ ((format (printf, 3, 4))) -static void _ldm_printk (const char *level, const char *function, - const char *fmt, ...) +static __printf(3, 4) +void _ldm_printk(const char *level, const char *function, const char *fmt, ...) 
{ - static char buf[128]; + struct va_format vaf; va_list args; va_start (args, fmt); - vsnprintf (buf, sizeof (buf), fmt, args); - va_end (args); - printk ("%s%s(): %s\n", level, function, buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%s(): %pV\n", level, function, &vaf); + + va_end(args); } /** diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index dbd52d4b5eed..dc8a8dcc5ae1 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -112,8 +112,8 @@ struct extent_position { /* super.c */ -__attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) void udf_warning(struct super_block *, const char *, + const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 5be2755dd715..c26f2bcec264 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf extern const struct file_operations ufs_dir_operations; /* super.c */ -extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ufs_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_panic(struct super_block *, const char *, const char *, ...); /* symlink.c */ extern const struct inode_operations ufs_fast_symlink_inode_operations; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb7ea007672..56dc0c17f16a 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -3,31 +3,29 @@ struct xfs_mount; -extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) 
- __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +extern __printf(2, 3) +void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); #ifdef DEBUG -extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); #else -static inline void -__attribute__ ((format (printf, 2, 3))) -xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +static inline __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) { } #endif diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index dfb0ec666c94..84458b0c38d1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -61,11 +61,12 @@ struct bug_entry { */ #ifndef __WARN_TAINT #ifndef __ASSEMBLY__ -extern void warn_slowpath_fmt(const char *file, const int line, - const char *fmt, ...) __attribute__((format(printf, 3, 4))); -extern void warn_slowpath_fmt_taint(const char *file, const int line, - unsigned taint, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); +extern __printf(3, 4) +void warn_slowpath_fmt(const char *file, const int line, + const char *fmt, ...); +extern __printf(4, 5) +void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint, + const char *fmt, ...); extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #endif diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 43538b643560..cf3b446139ea 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -122,12 +122,12 @@ struct drm_device; * using the DRM_DEBUG_KMS and DRM_DEBUG. */ -extern __attribute__((format (printf, 4, 5))) +extern __printf(4, 5) void drm_ut_debug_printk(unsigned int request_level, - const char *prefix, - const char *function_name, - const char *format, ...); -extern __attribute__((format (printf, 2, 3))) + const char *prefix, + const char *function_name, + const char *format, ...); +extern __printf(2, 3) int drm_err(const char *func, const char *format, ...); /***********************************************************************/ diff --git a/include/linux/audit.h b/include/linux/audit.h index 0c8006129fb2..2f81c6f3b630 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -584,14 +584,13 @@ extern int audit_signals; #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ -extern void audit_log(struct audit_context *ctx, gfp_t gfp_mask, - int type, const char *fmt, ...) - __attribute__((format(printf,4,5))); +extern __printf(4, 5) +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, + const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); -extern void audit_log_format(struct audit_buffer *ab, - const char *fmt, ...) 
- __attribute__((format(printf,2,3))); +extern __printf(2, 3) +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern int audit_string_contains_control(const char *string, size_t len); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 8e9e4bc6d73b..4d1a0748eaf8 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -170,7 +170,7 @@ extern void blk_trace_shutdown(struct request_queue *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts); -extern __attribute__((format(printf, 2, 3))) +extern __printf(2, 3) void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** diff --git a/include/linux/device.h b/include/linux/device.h index 85e78fc7d7fd..e88abeecfadf 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -622,8 +622,8 @@ static inline const char *dev_name(const struct device *dev) return kobject_name(&dev->kobj); } -extern int dev_set_name(struct device *dev, const char *name, ...) - __attribute__((format(printf, 2, 3))); +extern __printf(2, 3) +int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) @@ -753,10 +753,10 @@ extern struct device *device_create_vargs(struct class *cls, void *drvdata, const char *fmt, va_list vargs); -extern struct device *device_create(struct class *cls, struct device *parent, - dev_t devt, void *drvdata, - const char *fmt, ...) - __attribute__((format(printf, 5, 6))); +extern __printf(5, 6) +struct device *device_create(struct class *cls, struct device *parent, + dev_t devt, void *drvdata, + const char *fmt, ...); extern void device_destroy(struct class *cls, dev_t devt); /* @@ -800,64 +800,56 @@ extern const char *dev_driver_string(const struct device *dev); extern int __dev_printk(const char *level, const struct device *dev, struct va_format *vaf); -extern int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern int dev_emerg(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_alert(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_crit(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_err(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_warn(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_notice(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int _dev_info(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(3, 4) +int dev_printk(const char *level, const struct device *dev, + const char *fmt, ...) 
+ ; +extern __printf(2, 3) +int dev_emerg(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_alert(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_crit(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_err(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_warn(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_notice(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int _dev_info(const struct device *dev, const char *fmt, ...); #else static inline int __dev_printk(const char *level, const struct device *dev, struct va_format *vaf) - { return 0; } -static inline int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -static inline int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) - { return 0; } - -static inline int dev_emerg(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_emerg(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_crit(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_crit(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_alert(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_alert(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_err(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_err(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_warn(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_warn(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_notice(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_notice(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int _dev_info(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int _dev_info(const struct device *dev, const char *fmt, ...) - { return 0; } +{ return 0; } +static inline __printf(3, 4) +int dev_printk(const char *level, const struct device *dev, + const char *fmt, ...) +{ return 0; } + +static inline __printf(2, 3) +int dev_emerg(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_crit(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_alert(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_err(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_warn(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_notice(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int _dev_info(const struct device *dev, const char *fmt, ...) 
+{ return 0; } #endif diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 13aae8087b56..0564e3c39882 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -37,22 +37,21 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, #if defined(CONFIG_DYNAMIC_DEBUG) extern int ddebug_remove_module(const char *mod_name); -extern int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); struct device; -extern int __dynamic_dev_dbg(struct _ddebug *descriptor, - const struct device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +int __dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, + const char *fmt, ...); struct net_device; -extern int __dynamic_netdev_dbg(struct _ddebug *descriptor, - const struct net_device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +int __dynamic_netdev_dbg(struct _ddebug *descriptor, + const struct net_device *dev, + const char *fmt, ...); #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ static struct _ddebug __used __aligned(8) \ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 67a803aee619..81965cce6bfa 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -937,15 +937,15 @@ extern int ext3_group_extend(struct super_block *sb, ext3_fsblk_t n_blocks_count); /* super.c */ -extern void ext3_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext3_error(struct super_block *, const char *, const char *, ...); extern void __ext3_std_error (struct super_block *, const char *, int); -extern void ext3_abort (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext3_warning (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext3_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext3_abort(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_msg(struct super_block *, const char *, const char *, ...); extern void ext3_update_dynamic_rev (struct super_block *sb); #define ext3_std_error(sb, errno) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 87b4c6b9692d..7a049fd2aa4c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2635,8 +2635,8 @@ static const struct file_operations __fops = { \ .llseek = generic_file_llseek, \ }; -static inline void __attribute__((format(printf, 1, 2))) -__simple_attr_check_format(const char *fmt, ...) +static inline __printf(1, 2) +void __simple_attr_check_format(const char *fmt, ...) 
{ /* don't do anything, just let the compiler check the arguments; */ } diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index af095b54502e..ce31408b1e47 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -492,10 +492,10 @@ static inline void fscache_end_io(struct fscache_retrieval *op, /* * out-of-line cache backend functions */ -extern void fscache_init_cache(struct fscache_cache *cache, - const struct fscache_cache_ops *ops, - const char *idfmt, - ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, ...); extern int fscache_add_cache(struct fscache_cache *cache, struct fscache_object *fsdef, diff --git a/include/linux/gameport.h b/include/linux/gameport.h index b65a6f472775..069ee4139105 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -78,8 +78,8 @@ static inline void gameport_register_port(struct gameport *gameport) void gameport_unregister_port(struct gameport *gameport); -void gameport_set_phys(struct gameport *gameport, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +void gameport_set_phys(struct gameport *gameport, const char *fmt, ...); #else @@ -93,8 +93,8 @@ static inline void gameport_unregister_port(struct gameport *gameport) return; } -static inline void gameport_set_phys(struct gameport *gameport, - const char *fmt, ...) +static inline __printf(2, 3) +void gameport_set_phys(struct gameport *gameport, const char *fmt, ...) { return; } diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 0df513b7a9f8..387571959dd9 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -101,9 +101,8 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u #endif /*CONFIG_KALLSYMS*/ /* This macro allows us to keep printk typechecking */ -static void __check_printsym_format(const char *fmt, ...) -__attribute__((format(printf,1,2))); -static inline void __check_printsym_format(const char *fmt, ...) +static __printf(1, 2) +void __check_printsym_format(const char *fmt, ...) { } diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 529d9a0c75a5..064725854db8 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -114,12 +114,9 @@ typedef enum { } kdb_reason_t; extern int kdb_trap_printk; -extern int vkdb_printf(const char *fmt, va_list args) - __attribute__ ((format (printf, 1, 0))); -extern int kdb_printf(const char *, ...) - __attribute__ ((format (printf, 1, 2))); -typedef int (*kdb_printf_t)(const char *, ...) - __attribute__ ((format (printf, 1, 2))); +extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args); +extern __printf(1, 2) int kdb_printf(const char *, ...); +typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); extern void kdb_init(int level); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8eefcf7e95eb..e40c950e1d62 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -296,20 +296,18 @@ extern long long simple_strtoll(const char *,char **,unsigned int); #define strict_strtoull kstrtoull #define strict_strtoll kstrtoll -extern int sprintf(char * buf, const char * fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int vsprintf(char *buf, const char *, va_list) - __attribute__ ((format (printf, 2, 0))); -extern int snprintf(char * buf, size_t size, const char * fmt, ...) 
- __attribute__ ((format (printf, 3, 4))); -extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) - __attribute__ ((format (printf, 3, 0))); -extern int scnprintf(char * buf, size_t size, const char * fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) - __attribute__ ((format (printf, 3, 0))); -extern char *kasprintf(gfp_t gfp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...); +extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list); +extern __printf(3, 4) +int snprintf(char *buf, size_t size, const char *fmt, ...); +extern __printf(3, 0) +int vsnprintf(char *buf, size_t size, const char *fmt, va_list args); +extern __printf(3, 4) +int scnprintf(char *buf, size_t size, const char *fmt, ...); +extern __printf(3, 0) +int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); +extern __printf(2, 3) +char *kasprintf(gfp_t gfp, const char *fmt, ...); extern char *kvasprintf(gfp_t gfp, const char *fmt, va_list args); extern int sscanf(const char *, const char *, ...) @@ -427,8 +425,8 @@ extern void tracing_start(void); extern void tracing_stop(void); extern void ftrace_off_permanent(void); -static inline void __attribute__ ((format (printf, 1, 2))) -____trace_printk_check_format(const char *fmt, ...) +static inline __printf(1, 2) +void ____trace_printk_check_format(const char *fmt, ...) { } #define __trace_printk_check_format(fmt, args...) \ @@ -467,13 +465,11 @@ do { \ __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) -extern int -__trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int __trace_bprintk(unsigned long ip, const char *fmt, ...); -extern int -__trace_printk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int __trace_printk(unsigned long ip, const char *fmt, ...); extern void trace_dump_stack(void); @@ -502,8 +498,8 @@ __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else -static inline int -trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +static inline __printf(1, 2) +int trace_printk(const char *fmt, ...); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c2478a342cd7..8944ebe7963e 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -130,8 +130,8 @@ int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); -void vmcoreinfo_append_str(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); unsigned long paddr_vmcoreinfo_note(void); #define VMCOREINFO_OSRELEASE(value) \ diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 0da38cf7db7b..b16f65390734 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -32,8 +32,8 @@ extern char modprobe_path[]; /* for sysctl */ /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ -extern int __request_module(bool wait, const char *name, ...) 
\ - __attribute__((format(printf, 2, 3))); +extern __printf(2, 3) +int __request_module(bool wait, const char *name, ...); #define request_module(mod...) __request_module(true, mod) #define request_module_nowait(mod...) __request_module(false, mod) #define try_then_request_module(x, mod...) \ diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 668729cc0fe9..ad81e1c51487 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -72,8 +72,8 @@ struct kobject { unsigned int uevent_suppress:1; }; -extern int kobject_set_name(struct kobject *kobj, const char *name, ...) - __attribute__((format(printf, 2, 3))); +extern __printf(2, 3) +int kobject_set_name(struct kobject *kobj, const char *name, ...); extern int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs); @@ -83,15 +83,13 @@ static inline const char *kobject_name(const struct kobject *kobj) } extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); -extern int __must_check kobject_add(struct kobject *kobj, - struct kobject *parent, - const char *fmt, ...) - __attribute__((format(printf, 3, 4))); -extern int __must_check kobject_init_and_add(struct kobject *kobj, - struct kobj_type *ktype, - struct kobject *parent, - const char *fmt, ...) - __attribute__((format(printf, 4, 5))); +extern __printf(3, 4) __must_check +int kobject_add(struct kobject *kobj, struct kobject *parent, + const char *fmt, ...); +extern __printf(4, 5) __must_check +int kobject_init_and_add(struct kobject *kobj, + struct kobj_type *ktype, struct kobject *parent, + const char *fmt, ...); extern void kobject_del(struct kobject *kobj); @@ -212,8 +210,8 @@ int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); -int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) - __attribute__((format (printf, 2, 3))); +__printf(2, 3) +int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); int kobject_action_type(const char *buf, size_t count, enum kobject_action *type); @@ -226,7 +224,7 @@ static inline int kobject_uevent_env(struct kobject *kobj, char *envp[]) { return 0; } -static inline __attribute__((format(printf, 2, 3))) +static inline __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) { return 0; } diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 1e923e5e88e8..5cac19b3a266 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,11 +4,11 @@ #include #include +__printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, - const char namefmt[], ...) - __attribute__((format(printf, 4, 5))); + const char namefmt[], ...); #define kthread_create(threadfn, data, namefmt, arg...) 
\ kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) diff --git a/include/linux/libata.h b/include/linux/libata.h index 23fa829bf7a3..cafc09a64fe4 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1256,13 +1256,13 @@ static inline int sata_srst_pmp(struct ata_link *link) /* * printk helpers */ -__attribute__((format (printf, 3, 4))) +__printf(3, 4) int ata_port_printk(const struct ata_port *ap, const char *level, const char *fmt, ...); -__attribute__((format (printf, 3, 4))) +__printf(3, 4) int ata_link_printk(const struct ata_link *link, const char *level, const char *fmt, ...); -__attribute__((format (printf, 3, 4))) +__printf(3, 4) int ata_dev_printk(const struct ata_device *dev, const char *level, const char *fmt, ...); @@ -1304,10 +1304,10 @@ void ata_print_version(const struct device *dev, const char *version); /* * ata_eh_info helpers */ -extern void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...); +extern __printf(2, 3) +void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...); extern void ata_ehi_clear_desc(struct ata_eh_info *ehi); static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi) @@ -1321,8 +1321,8 @@ static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi) /* * port description helpers */ -extern void ata_port_desc(struct ata_port *ap, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void ata_port_desc(struct ata_port *ap, const char *fmt, ...); #ifdef CONFIG_PCI extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset, const char *name); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 97491f78b08c..c5d52780d6a0 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -49,8 +49,7 @@ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, extern void mmiotrace_iounmap(volatile void __iomem *addr); /* For anyone to insert markers. Remember trailing newline. */ -extern int mmiotrace_printk(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); +extern __printf(1, 2) int mmiotrace_printk(const char *fmt, ...); #else /* !CONFIG_MMIOTRACE: */ static inline int is_kmmio_active(void) { @@ -71,10 +70,7 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } -static inline int mmiotrace_printk(const char *fmt, ...) - __attribute__ ((format (printf, 1, 0))); - -static inline int mmiotrace_printk(const char *fmt, ...) +static inline __printf(1, 2) int mmiotrace_printk(const char *fmt, ...) { return 0; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df1c836e6948..cbeb5867cff7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2622,23 +2622,23 @@ static inline const char *netdev_name(const struct net_device *dev) extern int __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf); -extern int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) - __attribute__ ((format (printf, 3, 4))); -extern int netdev_emerg(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_alert(const struct net_device *dev, const char *format, ...) 
- __attribute__ ((format (printf, 2, 3))); -extern int netdev_crit(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_err(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_warn(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_notice(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_info(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(3, 4) +int netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); +extern __printf(2, 3) +int netdev_emerg(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_alert(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_crit(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_err(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_warn(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_notice(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_info(const struct net_device *dev, const char *format, ...); #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) diff --git a/include/linux/printk.h b/include/linux/printk.h index 0101d55d9651..f0e22f75143f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -82,22 +82,22 @@ struct va_format { * Dummy printk for disabled debugging statements to use whilst maintaining * gcc's format and side-effect checking. */ -static inline __attribute__ ((format (printf, 1, 2))) +static inline __printf(1, 2) int no_printk(const char *fmt, ...) { return 0; } -extern asmlinkage __attribute__ ((format (printf, 1, 2))) +extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); extern int printk_needs_cpu(int cpu); extern void printk_tick(void); #ifdef CONFIG_PRINTK -asmlinkage __attribute__ ((format (printf, 1, 0))) +asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); -asmlinkage __attribute__ ((format (printf, 1, 2))) __cold +asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); /* @@ -117,12 +117,12 @@ extern int kptr_restrict; void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else -static inline __attribute__ ((format (printf, 1, 0))) +static inline __printf(1, 0) int vprintk(const char *s, va_list args) { return 0; } -static inline __attribute__ ((format (printf, 1, 2))) __cold +static inline __printf(1, 2) __cold int printk(const char *s, ...) { return 0; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 26f9e3612e0f..d93f95e6177c 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -31,7 +31,7 @@ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) #define quota_error(sb, fmt, args...) 
\ __quota_error((sb), __func__, fmt , ## args) -extern __attribute__((format (printf, 3, 4))) +extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index be720cd2038d..0b69a4684216 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -84,8 +84,7 @@ int seq_putc(struct seq_file *m, char c); int seq_puts(struct seq_file *m, const char *s); int seq_write(struct seq_file *seq, const void *data, size_t len); -int seq_printf(struct seq_file *, const char *, ...) - __attribute__ ((format (printf,2,3))); +__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...); int seq_path(struct seq_file *, struct path *, char *); int seq_dentry(struct seq_file *, struct dentry *, char *); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 5cf397ceb726..7dadc3df0c77 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -29,10 +29,10 @@ trace_seq_init(struct trace_seq *s) * Currently only defined when tracing is enabled. */ #ifdef CONFIG_TRACING -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) - __attribute__ ((format (printf, 2, 0))); +extern __printf(2, 3) +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern __printf(2, 0) +int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args); extern int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e727555d4ee9..e86af08293a8 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -77,7 +77,7 @@ struct bt_power { #define BT_POWER_FORCE_ACTIVE_OFF 0 #define BT_POWER_FORCE_ACTIVE_ON 1 -__attribute__((format (printf, 2, 3))) +__printf(2, 3) int bt_printk(const char *level, const char *fmt, ...); #define BT_INFO(fmt, arg...) bt_printk(KERN_INFO, pr_fmt(fmt), ##arg) diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 920997f1aff0..e991bd0a27af 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -53,12 +53,13 @@ int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger); void nf_log_unbind_pf(u_int8_t pf); /* Calls the registered backend logging function */ +__printf(7, 8) void nf_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li, - const char *fmt, ...) __attribute__ ((format(printf,7,8))); + const char *fmt, ...); #endif /* _NF_LOG_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 5ac682f73d63..c6658bef7f32 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -76,8 +76,8 @@ printk(KERN_DEBUG msg); } while (0) #else /* Validate arguments and do nothing */ -static inline void __attribute__ ((format (printf, 2, 3))) -SOCK_DEBUG(struct sock *sk, const char *msg, ...) +static inline __printf(2, 3) +void SOCK_DEBUG(struct sock *sk, const char *msg, ...) 
{ } #endif diff --git a/include/sound/core.h b/include/sound/core.h index 1fa2407c966f..91d513879a78 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -326,9 +326,9 @@ void release_and_free_resource(struct resource *res); /* --- */ #if defined(CONFIG_SND_DEBUG) || defined(CONFIG_SND_VERBOSE_PRINTK) +__printf(4, 5) void __snd_printk(unsigned int level, const char *file, int line, - const char *format, ...) - __attribute__ ((format (printf, 4, 5))); + const char *format, ...); #else #define __snd_printk(level, file, line, format, args...) \ printk(format, ##args) diff --git a/include/sound/info.h b/include/sound/info.h index 4e94cf1ff762..5492cc40dc57 100644 --- a/include/sound/info.h +++ b/include/sound/info.h @@ -110,8 +110,8 @@ void snd_card_info_read_oss(struct snd_info_buffer *buffer); static inline void snd_card_info_read_oss(struct snd_info_buffer *buffer) {} #endif -int snd_iprintf(struct snd_info_buffer *buffer, const char *fmt, ...) \ - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +int snd_iprintf(struct snd_info_buffer *buffer, const char *fmt, ...); int snd_info_init(void); int snd_info_done(void); diff --git a/include/sound/seq_kernel.h b/include/sound/seq_kernel.h index 3d9afb6a8c9c..f352a98ce4f4 100644 --- a/include/sound/seq_kernel.h +++ b/include/sound/seq_kernel.h @@ -75,9 +75,9 @@ struct snd_seq_port_callback { }; /* interface for kernel client */ +__printf(3, 4) int snd_seq_create_kernel_client(struct snd_card *card, int client_index, - const char *name_fmt, ...) - __attribute__ ((format (printf, 3, 4))); + const char *name_fmt, ...); int snd_seq_delete_kernel_client(int client); int snd_seq_kernel_client_enqueue(int client, struct snd_seq_event *ev, int atomic, int hop); int snd_seq_kernel_client_dispatch(int client, struct snd_seq_event *ev, int atomic, int hop); diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index 901724dc528d..b62dfef15f61 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -6,12 +6,12 @@ extern struct console xenboot_console; #ifdef CONFIG_HVC_XEN void xen_console_resume(void); void xen_raw_console_write(const char *str); -__attribute__((format(printf, 1, 2))) +__printf(1, 2) void xen_raw_printk(const char *fmt, ...); #else static inline void xen_console_resume(void) { } static inline void xen_raw_console_write(const char *str) { } -static inline __attribute__((format(printf, 1, 2))) +static inline __printf(1, 2) void xen_raw_printk(const char *fmt, ...) { } #endif diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index aceeca799fd7..b9f9fb5af0d8 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -156,9 +156,9 @@ int xenbus_scanf(struct xenbus_transaction t, __attribute__((format(scanf, 4, 5))); /* Single printf and write: returns -errno or 0. */ +__printf(4, 5) int xenbus_printf(struct xenbus_transaction t, - const char *dir, const char *node, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); + const char *dir, const char *node, const char *fmt, ...); /* Generic read function: NULL-terminated triples of name, * sprintf-style type string, and pointer. Returns 0 or errno.*/ @@ -200,11 +200,11 @@ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, const char **, unsigned int)); +__printf(4, 5) int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, const char **, unsigned int), - const char *pathfmt, ...) 
- __attribute__ ((format (printf, 4, 5))); + const char *pathfmt, ...); int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn); @@ -223,9 +223,9 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port); enum xenbus_state xenbus_read_driver_state(const char *path); -__attribute__((format(printf, 3, 4))) +__printf(3, 4) void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...); -__attribute__((format(printf, 3, 4))) +__printf(3, 4) void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); const char *xenbus_strstate(enum xenbus_state state); diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index b6753f45624e..d86583f4831d 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -27,7 +27,7 @@ #include #include -__attribute__((format (printf, 2, 3))) +__printf(2, 3) int nfc_printk(const char *level, const char *fmt, ...); #define nfc_info(fmt, arg...) nfc_printk(KERN_INFO, fmt, ##arg) diff --git a/net/rds/rds.h b/net/rds/rds.h index da8adac2bf06..7eaba1831f0d 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -36,8 +36,8 @@ #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ -static inline void __attribute__ ((format (printf, 1, 2))) -rdsdebug(char *fmt, ...) +static inline __printf(1, 2) +void rdsdebug(char *fmt, ...) { } #endif @@ -625,8 +625,8 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), size_t item_len); -void __rds_conn_error(struct rds_connection *conn, const char *, ...) - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 30d70abb4e2c..dd5cc00ed559 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -971,9 +971,8 @@ static void svc_unregister(const struct svc_serv *serv) /* * Printk the given error with the address of the client that caused it. */ -static int -__attribute__ ((format (printf, 2, 3))) -svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) +static __printf(2, 3) +int svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) { va_list args; int r; diff --git a/sound/firewire/cmp.c b/sound/firewire/cmp.c index 14cacbc655dd..76294f2ae47f 100644 --- a/sound/firewire/cmp.c +++ b/sound/firewire/cmp.c @@ -32,7 +32,7 @@ enum bus_reset_handling { SUCCEED_ON_BUS_RESET, }; -static __attribute__((format(printf, 2, 3))) +static __printf(2, 3) void cmp_error(struct cmp_connection *c, const char *fmt, ...) { va_list va; -- cgit v1.2.3-58-ga151 From 0a90e0f1012e576500b551455b046013324826b9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Oct 2011 17:12:57 -0700 Subject: fat: follow rename pack_hex_byte() to hex_byte_pack() There is no functional change. 
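For context (an illustrative sketch, not part of the patch): the helper involved is roughly the following, as defined in include/linux/kernel.h around this time, with the old name kept as a thin wrapper until all callers are converted, which is why the conversion below is purely mechanical:

static inline char *hex_byte_pack(char *buf, u8 byte)
{
	*buf++ = hex_asc_hi(byte);
	*buf++ = hex_asc_lo(byte);
	return buf;
}

/* Deprecated alias; kept so callers can be converted one by one. */
static inline char *pack_hex_byte(char *buf, u8 byte)
{
	return hex_byte_pack(buf, byte);
}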
Signed-off-by: Andy Shevchenko Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 5efbd5d7701a..aca191bd5f8f 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, } else { if (uni_xlate == 1) { *op++ = ':'; - op = pack_hex_byte(op, ec >> 8); - op = pack_hex_byte(op, ec); + op = hex_byte_pack(op, ec >> 8); + op = hex_byte_pack(op, ec); len -= 5; } else { *op++ = '?'; -- cgit v1.2.3-58-ga151 From d8805e633e054c816c47cb6e727c81f156d9253d Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Mon, 31 Oct 2011 17:13:14 -0700 Subject: epoll: fix spurious lockdep warnings epoll can recursively acquire ep->mtx on multiple "struct eventpoll"s at once in the case where one epoll fd is monitoring another epoll fd. This is perfectly OK, since we're careful about the lock ordering, but it causes spurious lockdep warnings. Annotate the recursion using mutex_lock_nested, and add a comment explaining the nesting rules for good measure. Recent versions of systemd are triggering this, and it can also be demonstrated with the following trivial test program: --------------------8<-------------------- int main(void) { int e1, e2; struct epoll_event evt = { .events = EPOLLIN }; e1 = epoll_create1(0); e2 = epoll_create1(0); epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt); return 0; } --------------------8<-------------------- Reported-by: Paul Bolle Tested-by: Paul Bolle Signed-off-by: Nelson Elhage Acked-by: Jason Baron Cc: Dave Jones Cc: Davide Libenzi Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9026fc91fe3b..828e750af23a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -70,6 +70,15 @@ * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) * @ep: Pointer to the epoll private data structure. * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. * * Returns: The same integer error code returned by the @sproc callback.
*/ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv) + void *priv, + int depth) { int error, pwake = 0; unsigned long flags; @@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL); + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) @@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file) ep = epi->ep; list_del_init(&epi->fllink); - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } @@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct rb_node *rbp; struct epitem *epi; - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, call_nests + 1); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { @@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" -- cgit v1.2.3-58-ga151 From f6d90b4f9ce018bff429d6e01ee672de712b8641 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Nov 2011 07:06:17 -0700 Subject: sysfs: Make sysfs_rename safe with sysfs_dirents in rbtrees. In sysfs_rename we need to remove the optimization of not calling sysfs_unlink_sibling and sysfs_link_sibling if the parent directory of the renamed entry is not changing. This optimization is no longer valid now that sysfs dirents are stored in an rbtree sorted by name. Move the assignment of s_ns before the call of sysfs_link_sibling. With no sysfs_dirent fields changing after the call of sysfs_link_sibling, this allows sysfs_link_sibling to take any of the directory entries into account when it builds the rbtrees, and s_ns looks like a prime candidate to be used in the rbtree in the future. Signed-off-by: Eric W. Biederman Cc: Jiri Slaby Cc: Greg KH Cc: David Miller Cc: Mikulas Patocka Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/dir.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 48ffbdf0d017..7fdf6a7b7436 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -865,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_name = new_name; } - /* Remove from old parent's list and insert into new parent's list.
*/ - if (sd->s_parent != new_parent_sd) { - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - } + /* Move to the appropriate place in the appropriate directories rbtree. */ + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); sd->s_ns = new_ns; + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); error = 0; out: -- cgit v1.2.3-58-ga151 From fc0d14fe2d6403eb21202fd0c1cf67cd2c85ca67 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 27 Oct 2011 20:43:01 +0200 Subject: nfsd4: typo logical vs bitwise negate in nfsd4_decode_share_access Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 66d095d7955e..b6fa792d6b85 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -655,7 +655,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) default: return nfserr_bad_xdr; } - w &= !NFS4_SHARE_ACCESS_MASK; + w &= ~NFS4_SHARE_ACCESS_MASK; if (!w) return nfs_ok; if (!argp->minorversion) -- cgit v1.2.3-58-ga151 From 32de67569059d22b02dd9323a40220d953642b7e Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 1 Nov 2011 18:56:41 -0400 Subject: ext4: fix a syntax error in ext4_ext_insert_extent when debugging enabled This patch fixes a syntax error caused by a missing comma. Besides this, the logical block number is an unsigned 32-bit value, so printk should use %u instead of %d. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2798945a1920..8cacaf645e36 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1682,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* try to insert block into found extent and return */ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), @@ -1720,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %d\n", next); + ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); if (IS_ERR(npath)) @@ -1758,7 +1758,7 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), @@ -1768,8 +1768,8 @@ if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ - ext_debug("insert %d:%llu:[%d]%d %s before: " - "nearest 0x%p\n" + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), @@ -1779,8 +1779,8 @@ } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); - ext_debug("insert %d:%llu:[%d]%d %s
after: " - "nearest 0x%p\n" + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), @@ -1789,7 +1789,7 @@ has_space: } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { - ext_debug("insert %d:%llu:[%d]%d: " + ext_debug("insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), -- cgit v1.2.3-58-ga151 From bf52c6f7af55c48ab0fd5f990460b884b428d906 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 1 Nov 2011 18:59:26 -0400 Subject: ext4: let ext4_ext_rm_leaf work with EXT_DEBUG defined The variable 'block' is removed by commit 750c9c47, so use the replacement ex_ee_block instead. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8cacaf645e36..61fa9e1614af 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2424,7 +2424,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - ext_debug("new extent: %u:%u:%llu\n", block, num, + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); -- cgit v1.2.3-58-ga151 From 8762202dd0d6e46854f786bdb6fb3780a1625efe Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 1 Nov 2011 19:04:59 -0400 Subject: jbd/jbd2: validate sb->s_first in journal_get_superblock() I hit a J_ASSERT(blocknr != 0) failure in cleanup_journal_tail() when mounting a fsfuzzed ext3 image. It turns out that the corrupted ext3 image has s_first = 0 in journal superblock, and the 0 is passed to journal->j_head in journal_reset(), then to blocknr in cleanup_journal_tail(), in the end the J_ASSERT failed. So validate s_first after reading journal superblock from disk in journal_get_superblock() to ensure s_first is valid. 
The following script could reproduce it:

fstype=ext3
blocksize=1024
img=$fstype.img
offset=0
found=0
magic="c0 3b 39 98"

dd if=/dev/zero of=$img bs=1M count=8
mkfs -t $fstype -b $blocksize -F $img

filesize=`stat -c %s $img`
while [ $offset -lt $filesize ]
do
        if od -j $offset -N 4 -t x1 $img | grep -i "$magic";then
                echo "Found journal: $offset"
                found=1
                break
        fi
        offset=`echo "$offset+$blocksize" | bc`
done
if [ $found -ne 1 ];then
        echo "Magic \"$magic\" not found"
        exit 1
fi

dd if=/dev/zero of=$img seek=$(($offset+23)) conv=notrunc bs=1 count=1

mkdir -p ./mnt
mount -o loop $img ./mnt

Cc: Jan Kara Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/jbd/journal.c | 8 ++++++++ fs/jbd2/journal.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9fe061fb8779..fea8dd661d2b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f24df13adc4e..d6e93d00833e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1251,6 +1251,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: -- cgit v1.2.3-58-ga151 From f2a44523b20f323e4aef7c16261d34d6f0a4bf06 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 1 Nov 2011 19:09:18 -0400 Subject: jbd2: Unify log messages in jbd2 code Some jbd2 code prints out kernel messages with a "JBD2: " prefix, while other jbd2 code prints with a "JBD: " prefix. Unify the prefix to "JBD2: ". Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 26 +++++++++++++------------- fs/jbd2/journal.c | 36 ++++++++++++++++++------------------ fs/jbd2/recovery.c | 28 ++++++++++++++-------------- fs/jbd2/transaction.c | 4 ++-- 4 files changed, 47 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eef6979821a4..68d704db787f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd2_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD: starting commit of transaction %d\n", + jbd_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); @@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) __jbd2_journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); - jbd_debug (3, "JBD: commit phase 1\n"); + jbd_debug(3, "JBD2: commit phase 1\n"); /* * Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug (3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear @@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) WRITE_SYNC); blk_finish_plug(&plug); - jbd_debug(3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Way to go: we have now written out all of the data for a @@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT (bufs == 0); - jbd_debug(4, "JBD: get descriptor\n"); + jbd_debug(4, "JBD2: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) { @@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) } bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", + jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); @@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16) { - jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to @@ -707,7 +707,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 3\n"); + jbd_debug(3, "JBD2: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -771,7 +771,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -801,7 +801,7 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD2: commit phase 5\n"); write_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); commit_transaction->t_state = T_COMMIT_JFLUSH; @@ -830,7 +830,7 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD2: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); @@ -964,7 +964,7 @@ restart_loop: /* Done with this transaction! 
*/ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD2: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); @@ -1039,7 +1039,7 @@ restart_loop: journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD: commit %d complete, head %d\n", + jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) kfree(commit_transaction); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d6e93d00833e..0fa0123151d3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ journal->j_commit_request = target; - jbd_debug(1, "JBD: requesting commit %d/%d\n", + jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); @@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) /* This should never happen, but if it does, preserve the evidence before kjournald goes into a loop and increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", journal->j_commit_request, journal->j_commit_sequence, target, journal->j_running_transaction ? @@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } #endif while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); @@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", first, last); journal_fail_superblock(journal); return -EINVAL; @@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) */ if (sb->s_start == 0 && journal->j_tail_sequence == journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) } read_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal) ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - printk (KERN_ERR - "JBD: IO error reading journal superblock\n"); + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); goto out; } } @@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal) if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + printk(KERN_WARNING "JBD2: no valid journal superblock 
found\n"); goto out; } @@ -1240,14 +1240,14 @@ static int journal_get_superblock(journal_t *journal) journal->j_format_version = 2; break; default: - printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); goto out; } if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) journal->j_maxlen = be32_to_cpu(sb->s_maxlen); else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { - printk (KERN_WARNING "JBD: journal file too short\n"); + printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } @@ -1318,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal) ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk (KERN_WARNING - "JBD: Unrecognised features on journal\n"); + printk(KERN_WARNING + "JBD2: Unrecognised features on journal\n"); return -EINVAL; } } @@ -1354,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal) return 0; recovery_error: - printk (KERN_WARNING "JBD: recovery failed\n"); + printk(KERN_WARNING "JBD2: recovery failed\n"); return -EIO; } @@ -1585,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal, struct buffer_head *bh; printk(KERN_WARNING - "JBD: Converting superblock from version 1 to 2.\n"); + "JBD2: Converting superblock from version 1 to 2.\n"); /* Pre-initialise new fields to zero */ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); @@ -1702,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (!journal->j_tail) goto no_recovery; - printk (KERN_WARNING "JBD: %s recovery information on journal\n", + printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); @@ -2028,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void) retval = 0; if (!jbd2_journal_head_cache) { retval = -ENOMEM; - printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); } return retval; } @@ -2391,7 +2391,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_debugfs_entry(); jbd2_remove_jbd_stats_proc_entry(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 1cad869494f0..da6d7baf1390 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start) err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", next); goto failed; } @@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; if (offset >= journal->j_maxlen) { - printk(KERN_ERR "JBD: corrupted journal superblock\n"); + printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EIO; } err = jbd2_journal_bmap(journal, offset, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", offset); return err; } @@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, } if (!buffer_uptodate(bh)) { - printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", offset); brelse(bh); return -EIO; @@ -251,10 
+251,10 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(1, "JBD: recovery, exit status %d, " + jbd_debug(1, "JBD2: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal) err = do_one_pass(journal, &info, PASS_SCAN); if (err) { - printk(KERN_ERR "JBD: error %d scanning journal\n", err); + printk(KERN_ERR "JBD2: error %d scanning journal\n", err); ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); jbd_debug(1, - "JBD: ignoring %d transaction%s from the journal.\n", + "JBD2: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); #endif journal->j_transaction_sequence = ++info.end_transaction; @@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, wrap(journal, *next_log_block); err = jread(&obh, journal, io_block); if (err) { - printk(KERN_ERR "JBD: IO error %d recovering block " + printk(KERN_ERR "JBD2: IO error %d recovering block " "%lu in log\n", err, io_block); return 1; } else { @@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal, * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal, /* Recover what we can, but * report failure at the end. */ success = err; - printk (KERN_ERR - "JBD: IO error %d recovering " + printk(KERN_ERR + "JBD2: IO error %d recovering " "block %ld in log\n", err, io_block); } else { @@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal, journal->j_blocksize); if (nbh == NULL) { printk(KERN_ERR - "JBD: Out of memory " + "JBD2: Out of memory " "during recovery.\n"); err = -ENOMEM; brelse(bh); @@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal, /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ if (info->end_transaction != next_commit_ID) { - printk (KERN_ERR "JBD: recovery pass %d ended at " + printk(KERN_ERR "JBD2: recovery pass %d ended at " "transaction %u, expected %u\n", pass, next_commit_ID, info->end_transaction); if (!success) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 1e5c5ead5b0d..a0e41a4c080e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -125,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); return -ENOSPC; @@ -564,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh) char b[BDEVNAME_SIZE]; printk(KERN_WARNING - "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). 
" "There's a risk of filesystem corruption in case of system " "crash.\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); -- cgit v1.2.3-58-ga151 From 32096ea1aac14e6f29d4744924092eca52b937b0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 1 Nov 2011 16:12:33 +0300 Subject: vfs: fix dentry leak in simple_fill_super() put dentry if inode allocation failed, d_genocide() cannot release it Signed-off-by: Konstantin Khlebnikov Signed-off-by: Christoph Hellwig --- fs/libfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index c18e9a1235b6..a2c0029cd95f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic, if (!dentry) goto out; inode = new_inode(s); - if (!inode) + if (!inode) { + dput(dentry); goto out; + } inode->i_mode = S_IFREG | files->mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; -- cgit v1.2.3-58-ga151 From 1fa1e7f615f4d3ae436fa319af6e4eebdd4026a8 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 2 Nov 2011 09:44:39 +0100 Subject: readlinkat: ensure we return ENOENT for the empty pathname for normal lookups Since the commit below which added O_PATH support to the *at() calls, the error return for readlink/readlinkat for the empty pathname has switched from ENOENT to EINVAL: commit 65cfc6722361570bfe255698d9cd4dccaf47570d Author: Al Viro Date: Sun Mar 13 15:56:26 2011 -0400 readlinkat(), fchownat() and fstatat() with empty relative pathnames This is both unexpected for userspace and makes readlink/readlinkat inconsistant with all other interfaces; and inconsistant with our stated return for these pathnames. As the readlinkat call does not have a flags parameter we cannot use the AT_EMPTY_PATH approach used in the other calls. Therefore expose whether the original path is infact entry via a new user_path_at_empty() path lookup function. Use this to determine whether to default to EINVAL or ENOENT for failures. 
Addresses http://bugs.launchpad.net/bugs/817187 [akpm@linux-foundation.org: remove unused getname_flags()] Signed-off-by: Andy Whitcroft Cc: Christoph Hellwig Cc: Al Viro Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Christoph Hellwig --- fs/namei.c | 18 +++++++++++++----- fs/stat.c | 5 +++-- include/linux/namei.h | 1 + 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7657be4352bf..ac6d214da827 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -137,7 +137,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -static char *getname_flags(const char __user * filename, int flags) +static char *getname_flags(const char __user *filename, int flags, int *empty) { char *tmp, *result; @@ -148,6 +148,8 @@ static char *getname_flags(const char __user * filename, int flags) result = tmp; if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { __putname(tmp); result = ERR_PTR(retval); @@ -160,7 +162,7 @@ static char *getname_flags(const char __user * filename, int flags) char *getname(const char __user * filename) { - return getname_flags(filename, 0); + return getname_flags(filename, 0, 0); } #ifdef CONFIG_AUDITSYSCALL @@ -1798,11 +1800,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, NULL); } -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) { struct nameidata nd; - char *tmp = getname_flags(name, flags); + char *tmp = getname_flags(name, flags, empty); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -1816,6 +1818,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, return err; } +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, 0); +} + static int user_path_parent(int dfd, const char __user *path, struct nameidata *nd, char **name) { diff --git a/fs/stat.c b/fs/stat.c index 78a3aa83c7ea..8806b8997d2e 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -294,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, { struct path path; int error; + int empty = 0; if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); + error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); if (!error) { struct inode *inode = path.dentry->d_inode; - error = -EINVAL; + error = empty ? 
-ENOENT : -EINVAL; if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { diff --git a/include/linux/namei.h b/include/linux/namei.h index 409328d1cbbb..ffc02135c483 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,6 +67,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_EMPTY 0x4000 extern int user_path_at(int, const char __user *, unsigned, struct path *); +extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) #define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path) -- cgit v1.2.3-58-ga151 From 2833eb2b465a274d1a2529fed76c6d2904f8022b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:20 +0200 Subject: vfs: ignore error on forced remount On emergency remount we want to force MS_RDONLY on the super block even if ->remount_fs() failed for some reason. Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 32a81f3467e0..afd0f1ad45e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) - return retval; + if (retval) { + if (!force) + return retval; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); -- cgit v1.2.3-58-ga151 From c818b682c955aa4d13212512deb529cb8bd1348b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:25 +0200 Subject: jfs: remove unnecessary nlink setting alloc_inode() initializes i_nlink to 1. Remove unnecessary re-initialization. Signed-off-by: Miklos Szeredi Acked-by: Dave Kleikamp Signed-off-by: Christoph Hellwig --- fs/jfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 06c8a67cbe76..a44eff076c17 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_nlink = 1; inode->i_size = sb->s_bdev->bd_inode->i_size; inode->i_mapping->a_ops = &jfs_metapage_aops; insert_inode_hash(inode); -- cgit v1.2.3-58-ga151 From a7732b05f775a5575baac34c03bb0e8d16950edf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:26 +0200 Subject: ocfs2: remove unnecessary nlink setting alloc_inode() initializes i_nlink to 1. Remove unnecessary re-initialization. Signed-off-by: Miklos Szeredi CC: Joel Becker CC: Mark Fasheh Signed-off-by: Christoph Hellwig --- fs/ocfs2/namei.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 53aa41ed7bf3..0181f63dd463 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -200,8 +200,6 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) * callers. 
*/ if (S_ISDIR(mode)) inode->i_nlink = 2; - else - inode->i_nlink = 1; inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; -- cgit v1.2.3-58-ga151 From dd2a981f46a0903a8770a784f213d4d40bbb6f19 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:27 +0200 Subject: logfs: remove unnecessary nlink setting alloc_inode() initializes i_nlink to 1. Remove unnecessary re-initialization. Signed-off-by: Miklos Szeredi CC: Joern Engel CC: Prasad Joshi Signed-off-by: Christoph Hellwig --- fs/logfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index edfea7a3a747..9abb048b96c0 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -199,7 +199,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode) inode->i_blocks = 0; inode->i_ctime = CURRENT_TIME; inode->i_mtime = CURRENT_TIME; - inode->i_nlink = 1; li->li_refcount = 1; INIT_LIST_HEAD(&li->li_freeing_list); -- cgit v1.2.3-58-ga151 From 6d6b77f163c7eabedbba00ed2abb7d4a570bff76 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:28 +0200 Subject: filesystems: add missing nlink wrappers Replace direct i_nlink updates with the respective updater function (inc_nlink, drop_nlink, clear_nlink, inode_dec_link_count). Signed-off-by: Miklos Szeredi --- arch/s390/hypfs/inode.c | 2 +- drivers/mtd/mtdchar.c | 2 +- fs/affs/amigaffs.c | 2 +- fs/affs/namei.c | 6 +++--- fs/binfmt_misc.c | 2 +- fs/cifs/link.c | 2 +- fs/coda/dir.c | 2 +- fs/devpts/inode.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext3/ialloc.c | 2 +- fs/ext3/namei.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/migrate.c | 2 +- fs/ext4/namei.c | 2 +- fs/hfs/dir.c | 4 ++-- fs/hfsplus/dir.c | 4 ++-- fs/hpfs/inode.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/jfs/jfs_inode.c | 2 +- fs/jfs/namei.c | 10 +++++----- fs/logfs/dir.c | 8 ++++---- fs/logfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/ocfs2/namei.c | 8 ++++---- fs/proc/proc_sysctl.c | 2 +- fs/reiserfs/inode.c | 4 ++-- fs/reiserfs/namei.c | 12 ++++++------ fs/ubifs/xattr.c | 2 +- fs/udf/namei.c | 11 ++++------- fs/ufs/ialloc.c | 2 +- mm/shmem.c | 2 +- 32 files changed, 55 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 7ae0d0d985a8..baa82f8dd076 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -359,7 +359,7 @@ static struct dentry *hypfs_create_file(struct super_block *sb, } else if (mode & S_IFDIR) { inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - parent->d_inode->i_nlink++; + inc_nlink(parent->d_inode); } else BUG(); inode->i_private = data; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index f1af2228a1b1..61086ea3cc6b 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1144,7 +1144,7 @@ static void mtdchar_notify_remove(struct mtd_info *mtd) if (mtd_ino) { /* Destroy the inode if it exists */ - mtd_ino->i_nlink = 0; + clear_nlink(mtd_ino); iput(mtd_ino); } } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 3a4557e8325c..8f12723bbc97 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry) if (inode->i_nlink > 1) retval = affs_remove_link(dentry); else - inode->i_nlink = 0; + clear_nlink(inode); affs_unlock_link(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index e3e9efc1fdd8..780a11dc6318 100644 --- 
a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; error = affs_add_entry(dir, inode, dentry, ST_FILE); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); return error; } @@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) error = affs_add_entry(dir, inode, dentry, ST_USERDIR); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; @@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return 0; err: - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ba1a1ae4a18a..1e9edbdeda7e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -521,7 +521,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - dentry->d_inode->i_nlink--; + drop_nlink(dentry->d_inode); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 8693b5d0e180..6b0e06434391 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -433,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, if (old_file->d_inode) { cifsInode = CIFS_I(old_file->d_inode); if (rc == 0) { - old_file->d_inode->i_nlink++; + inc_nlink(old_file->d_inode); /* BB should we make this contingent on superblock flag NOATIME? */ /* old_file->d_inode->i_ctime = CURRENT_TIME;*/ /* parent dir timestamps will update from srv diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0239433f50cb..28e7e135cfab 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) if (!error) { /* VFS may delete the child */ if (de->d_inode) - de->d_inode->i_nlink = 0; + clear_nlink(de->d_inode); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 2f27e578d466..c196e544c64e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty) dentry = d_find_alias(inode); - inode->i_nlink--; + drop_nlink(inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ee9ed31948e1..c4e81dfb74ba 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -601,7 +601,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf938cc..835d4ea61b0e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -621,7 +621,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0629e09f6511..8a60e3327659 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1833,7 +1833,7 @@ retry: if (err) { out_clear_inode: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9c63f273b550..ecc55bd2f997 100644 --- 
a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1084,7 +1084,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(inode_bitmap_bh); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b57b98fb44d1..667cc880bc0c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -507,7 +507,7 @@ int ext4_ext_migrate(struct inode *inode) * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ - tmp_inode->i_nlink = 0; + clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1c924faeb6c8..4623c082f3b2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2539,7 +2539,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - new_inode->i_nlink = 0; + clear_nlink(new_inode); } else { ext4_inc_count(handle, new_dir); ext4_update_dx_flag(new_dir); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b4d70b13be92..bce4eef91a06 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; @@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 25b2443a004c..4536cd3f15ae 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, goto out; out_err: - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); out: @@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); goto out; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 338cd8368451..dfe800c8ae38 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 0;*/ + clear_nlink(i);*/ make_bad_inode(i); return; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ec889538e5a6..0be5a78598d0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -970,7 +970,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; + clear_nlink(inode); error = -ENFILE; file = alloc_file(&path, FMODE_WRITE | FMODE_READ, diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 2686531e235a..c1a3e603279c 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -157,7 +157,7 @@ fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); fail_put: iput(inode); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e17545e15664..94b0a624b85f 100644 --- a/fs/jfs/namei.c +++ 
b/fs/jfs/namei.c @@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry, rc = txCommit(tid, 2, &iplist[0], 0); if (rc) { - ip->i_nlink--; /* never instantiated */ + drop_nlink(ip); /* never instantiated */ iput(ip); } else d_instantiate(dentry, ip); @@ -1048,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -1433,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, mutex_unlock(&JFS_IP(dir)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b3ff3d894165..b7d7f67cee5a 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode) { int ret; - inode->i_nlink--; + drop_nlink(inode); ret = write_inode(inode); LOGFS_BUG_ON(ret, inode->i_sb); return ret; @@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, ta = kzalloc(sizeof(*ta), GFP_KERNEL); if (!ta) { - inode->i_nlink--; + drop_nlink(inode); iput(inode); return -ENOMEM; } @@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, abort_transaction(inode, ta); li->li_flags |= LOGFS_IF_STILLBORN; /* FIXME: truncate symlink */ - inode->i_nlink--; + drop_nlink(inode); iput(inode); goto out; } @@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); - inode->i_nlink++; + inc_nlink(inode); mark_inode_dirty_sync(inode); return __logfs_create(dir, dentry, inode, NULL, 0); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 9abb048b96c0..7e441ad5f792 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) /* inode->i_nlink == 0 can be true when called from * block validator */ /* set i_nlink to 0 to prevent caching */ - inode->i_nlink = 0; + clear_nlink(inode); logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; iget_failed(inode); if (!err) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4dc6d078f108..1fc715acc2da 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -320,7 +320,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); inode->i_version = 0; inode->i_size = 0; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_uid = -2; inode->i_gid = -2; inode->i_blocks = 0; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 666628b395f1..08ac272b7c28 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) failed_acl: failed_bmap: - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); /* raw_inode will be deleted through generic_delete_inode() */ goto failed; diff --git 
a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 0181f63dd463..ea0ecbdda58c 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1377,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir, } if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; @@ -1385,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir, if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); - old_dir->i_nlink--; + drop_nlink(old_dir); if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); } else { inc_nlink(new_dir); mark_inode_dirty(new_dir); @@ -2280,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - inode->i_nlink = 0; + clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, 0, &new_di_bh, parent_di_bh, handle, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbef226f..b44113279e30 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -39,7 +39,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b0d4b78b4fb..c425441c6942 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode, /* a stale NFS handle can trigger this without it being an error */ pathrelse(&path_to_sd); reiserfs_make_bad_inode(inode); - inode->i_nlink = 0; + clear_nlink(inode); return; } @@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, make_bad_inode(inode); out_inserted_sd: - inode->i_nlink = 0; + clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ iput(inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ef392324bbf1..6ce332821633 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink = 0; + clear_nlink(inode); DEC_DIR_INODE_NLINK(dir); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); @@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, parent_dir->i_sb, jbegin_count); if (err) @@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, retval = journal_begin(&th, 
dir->i_sb, jbegin_count); if (retval) { - inode->i_nlink--; + drop_nlink(inode); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); err = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 16f19f55e63f..993adc8f1c87 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -558,7 +558,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) } ubifs_assert(inode->i_nlink == 1); - inode->i_nlink = 0; + clear_nlink(inode); err = remove_xattr(c, host, inode, &nm); if (err) inode->i_nlink = 1; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce848ef96..e8d61b114c1b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -665,8 +663,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_fop = &udf_dir_operations; fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); goto out; } @@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); goto out; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 2eabf04af3de..78a4c70d46b5 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -341,7 +341,7 @@ cg_found: fail_remove_inode: unlock_super(sb); - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); diff --git a/mm/shmem.c b/mm/shmem.c index fa4fa6ce13bc..45b9acb575f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2503,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; /* It is unlinked */ + clear_nlink(inode); /* It is unlinked */ #ifndef CONFIG_MMU error = ramfs_nommu_expand_for_mapping(inode, size); if (error) -- cgit v1.2.3-58-ga151 From bfe8684869601dacfcb2cd69ef8cfd9045f62170 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:29 +0200 Subject: filesystems: add set_nlink() Replace remaining direct i_nlink updates with a new set_nlink() updater function. 
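For reference, a minimal sketch of the helper this patch introduces in include/linux/fs.h (that hunk falls outside this excerpt, so the kerneldoc wording below is illustrative rather than verbatim):

	/**
	 * set_nlink - directly set an inode's link count
	 * @inode: inode being updated
	 * @nlink: the new link count
	 *
	 * Low-level filesystem helper that replaces open-coded stores
	 * to inode->i_nlink, giving the VFS a single choke point for
	 * link count updates.
	 */
	static inline void set_nlink(struct inode *inode, unsigned int nlink)
	{
		inode->i_nlink = nlink;
	}

With one entry point in place, the conversions below are mechanical ("inode->i_nlink = n" becomes "set_nlink(inode, n)"), and any future sanity checking or accounting of link counts can live inside the helper instead of in every filesystem.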
Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- arch/s390/hypfs/inode.c | 2 +- drivers/staging/pohmelfs/inode.c | 2 +- fs/9p/vfs_inode.c | 4 ++-- fs/9p/vfs_inode_dotl.c | 4 ++-- fs/adfs/inode.c | 2 +- fs/affs/amigaffs.c | 2 +- fs/affs/inode.c | 8 ++++---- fs/afs/fsclient.c | 2 +- fs/afs/inode.c | 4 ++-- fs/autofs4/inode.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/bfs/dir.c | 2 +- fs/bfs/inode.c | 2 +- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/tree-log.c | 2 +- fs/ceph/caps.c | 2 +- fs/ceph/inode.c | 2 +- fs/cifs/inode.c | 6 +++--- fs/coda/coda_linux.c | 2 +- fs/devpts/inode.c | 2 +- fs/ecryptfs/inode.c | 12 ++++++------ fs/efs/inode.c | 2 +- fs/exofs/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext3/namei.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/namei.c | 6 +++--- fs/fat/inode.c | 4 ++-- fs/fat/namei_msdos.c | 2 +- fs/fat/namei_vfat.c | 2 +- fs/freevxfs/vxfs_inode.c | 2 +- fs/fuse/control.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/glops.c | 2 +- fs/hfs/inode.c | 4 ++-- fs/hfsplus/inode.c | 10 +++++----- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/dir.c | 2 +- fs/hpfs/inode.c | 8 ++++---- fs/hpfs/namei.c | 8 ++++---- fs/hppfs/hppfs.c | 2 +- fs/isofs/inode.c | 4 ++-- fs/isofs/rock.c | 4 ++-- fs/jffs2/dir.c | 6 +++--- fs/jffs2/fs.c | 6 +++--- fs/jfs/jfs_imap.c | 6 +++--- fs/jfs/namei.c | 2 +- fs/libfs.c | 2 +- fs/logfs/readwrite.c | 2 +- fs/minix/inode.c | 4 ++-- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 4 ++-- fs/nilfs2/inode.c | 2 +- fs/nilfs2/namei.c | 2 +- fs/ntfs/inode.c | 8 ++++---- fs/ocfs2/dir.c | 4 ++-- fs/ocfs2/dlmglue.c | 2 +- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/namei.c | 8 ++++---- fs/openpromfs/inode.c | 4 ++-- fs/proc/base.c | 12 ++++++------ fs/proc/generic.c | 2 +- fs/proc/inode.c | 2 +- fs/qnx4/inode.c | 2 +- fs/reiserfs/inode.c | 6 +++--- fs/reiserfs/namei.c | 4 ++-- fs/romfs/super.c | 2 +- fs/squashfs/inode.c | 18 +++++++++--------- fs/stack.c | 2 +- fs/sysfs/inode.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/ubifs/xattr.c | 2 +- fs/udf/inode.c | 8 +++++--- fs/udf/namei.c | 4 ++-- fs/ufs/inode.c | 4 ++-- fs/xfs/xfs_iops.c | 2 +- include/linux/fs.h | 13 +++++++++++++ 81 files changed, 163 insertions(+), 148 deletions(-) (limited to 'fs') diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index baa82f8dd076..481f4f76f664 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -108,7 +108,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, int mode) ret->i_gid = hypfs_info->gid; ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; if (mode & S_IFDIR) - ret->i_nlink = 2; + set_nlink(ret, 2); } return ret; } diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index f3c6060c96b8..7a1955583b7d 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -1197,7 +1197,7 @@ const struct inode_operations pohmelfs_file_inode_operations = { void pohmelfs_fill_inode(struct inode *inode, struct netfs_inode_info *info) { inode->i_mode = info->mode; - inode->i_nlink = info->nlink; + set_nlink(inode, info->nlink); inode->i_uid = info->uid; inode->i_gid = info->gid; inode->i_blocks = info->blocks; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b5a1076aaa6c..879ed8851737 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1138,7 +1138,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct v9fs_session_info *v9ses = sb->s_fs_info; struct 
v9fs_inode *v9inode = V9FS_I(inode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; @@ -1164,7 +1164,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* HARDLINKCOUNT %u */ sscanf(ext, "%13s %u", tag_name, &i_nlink); if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) - inode->i_nlink = i_nlink; + set_nlink(inode, i_nlink); } } mode = stat->mode & S_IALLUGO; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index aded79fcd5cf..0b5745e21946 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_ctime.tv_nsec = stat->st_ctime_nsec; inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { inode->i_mode = stat->st_mode; if ((S_ISBLK(inode->i_mode)) || diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index d5250c5aae21..1dab6a174d6a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_gid = ADFS_SB(sb)->s_gid; inode->i_ino = obj->file_id; inode->i_size = obj->size; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 8f12723bbc97..de37ec842340 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry) break; default: if (!AFFS_TAIL(sb, bh)->link_chain) - inode->i_nlink = 1; + set_nlink(inode, 1); } affs_free_block(sb, link_ino); goto done; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 5d828903ac69..88a4b0b50058 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) prot = be32_to_cpu(tail->protect); inode->i_size = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mode = 0; AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; @@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) sbi->s_hashsize + 1; } if (tail->link_chain) - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? 
&affs_aops_ofs : &affs_aops; inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; @@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_ino = block; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; @@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); - inode->i_nlink = 2; + set_nlink(inode, 2); ihold(inode); } affs_fix_checksum(sb, bh); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 346e3289abd7..2f213d109c21 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; vnode->vfs_inode.i_generation = vnode->fid.unique; - vnode->vfs_inode.i_nlink = status->nlink; + set_nlink(&vnode->vfs_inode, status->nlink); mode = vnode->vfs_inode.i_mode; mode &= ~S_IALLUGO; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0fdab6e03d87..d890ae3b2ce6 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) fscache_attr_changed(vnode->cache); #endif - inode->i_nlink = vnode->status.nlink; + set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; inode->i_gid = 0; inode->i_size = vnode->status.size; @@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_uid = 0; inode->i_gid = 0; inode->i_ctime.tv_sec = get_seconds(); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 180fa2425e49..8179f1ab8175 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 720d885e8dca..8342ca67abcd 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_gid = befs_sb->mount_opts.use_gid ? befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); - inode->i_nlink = 1; + set_nlink(inode, 1); /* * BEFS's time is 64 bits, but current VFS is 32 bits... 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index b14cebfd9047..9cc074019479 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index a8e37f81d097..697af5bf70b3 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); inode->i_uid = le32_to_cpu(di->i_uid); inode->i_gid = le32_to_cpu(di->i_gid); - inode->i_nlink = le32_to_cpu(di->i_nlink); + set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672f4c18..ae4d9cd10961 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1641,7 +1641,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_gid = btrfs_stack_inode_gid(inode_item); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); - inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..07ea91879a91 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1705,7 +1705,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. 
* the real end of the address space is determined by all of diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..75686a61bd45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2534,7 +2534,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); inode->i_uid = btrfs_inode_uid(leaf, inode_item); inode->i_gid = btrfs_inode_gid(leaf, inode_item); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); @@ -6728,7 +6728,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 786639fca067..0618aa39740b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1030,7 +1030,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } btrfs_release_path(path); if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b8731bf3ef1f..15b21e35078a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2363,7 +2363,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + set_nlink(inode, le32_to_cpu(grant->nlink)); if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5dde7d51dc11..1616a0d37cbd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -618,7 +618,7 @@ static int fill_inode(struct inode *inode, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(info->nlink); + set_nlink(inode, le32_to_cpu(info->nlink)); /* be careful with mtime, atime, size */ ceph_decode_timespec(&atime, &info->atime); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2c50bd2f65d1..e851d5b8931e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; - inode->i_nlink = fattr->cf_nlink; + set_nlink(inode, fattr->cf_nlink); inode->i_uid = fattr->cf_uid; inode->i_gid = fattr->cf_gid; @@ -905,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb) if (rc && tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &cifs_ipc_inode_ops; inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; @@ -1367,7 +1367,7 @@ mkdir_get_info: /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) - direntry->d_inode->i_nlink = 2; + set_nlink(direntry->d_inode, 2); mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2bdbcc11b373..854ace712685 100644 --- a/fs/coda/coda_linux.c +++ 
b/fs/coda/coda_linux.c @@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_gid != -1) inode->i_gid = (gid_t) attr->va_gid; if (attr->va_nlink != -1) - inode->i_nlink = attr->va_nlink; + set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) inode->i_size = attr->va_size; if (attr->va_size != -1) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c196e544c64e..d5d5297efe97 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); s->s_root = d_alloc_root(inode); if (s->s_root) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 11f8582d7218..a36d327f1521 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -474,8 +474,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - old_dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + set_nlink(old_dentry->d_inode, + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); i_size_write(new_dentry->d_inode, file_size_save); out_lock: unlock_dir(lower_dir_dentry); @@ -499,8 +499,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) goto out_unlock; } fsstack_copy_attr_times(dir, lower_dir_inode); - dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + set_nlink(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); dentry->d_inode->i_ctime = dir->i_ctime; d_drop(dentry); out_unlock: @@ -565,7 +565,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); out: unlock_dir(lower_dir_dentry); if (!dentry->d_inode) @@ -588,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (!rc && dentry->d_inode) clear_nlink(dentry->d_inode); fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 9c13412e6c99..bc84f365d75c 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) efs_inode = (struct efs_dinode *) (bh->b_data + offset); inode->i_mode = be16_to_cpu(efs_inode->di_mode); - inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); inode->i_size = be32_to_cpu(efs_inode->di_size); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3e5f3a6be90a..f6dbf7768ce6 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1165,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(fcb.i_mode); inode->i_uid = le32_to_cpu(fcb.i_uid); inode->i_gid = le32_to_cpu(fcb.i_gid); - inode->i_nlink = 
le16_to_cpu(fcb.i_links_count); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a8a58f63f07c..91a6945af6d8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 12661e1deedd..85fe655fe3e0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2899,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8a60e3327659..642dc6d66dfd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1821,7 +1821,7 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, dir_block); if (err) @@ -2170,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0defe0bfe019..be6668bbc1b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3418,7 +3418,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4623c082f3b2..5f7fb46293be 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1694,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) if (is_dx(inode) && inode->i_nlink > 1) { /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - inode->i_nlink = 1; + set_nlink(inode, 1); EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_DIR_NLINK); } @@ -1861,7 +1861,7 @@ retry: de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, dir, dir_block); if 
(err) @@ -2214,7 +2214,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1726d7303047..808cac7edcfb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return error; MSDOS_I(inode)->mmu_private = inode->i_size; - inode->i_nlink = fat_subdirs(inode); + set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, @@ -1233,7 +1233,7 @@ static int fat_read_root(struct inode *inode) fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; - inode->i_nlink = fat_subdirs(inode)+2; + set_nlink(inode, fat_subdirs(inode)+2); return 0; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 66e83b845455..216b419f30e2 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) /* the directory was completed, just return a error */ goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index bb3f29c3557b..a87a65663c25 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -900,7 +900,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; } inode->i_version++; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1a4311437a8b..7b2af5abe2fa 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) ip->i_uid = (uid_t)vip->vii_uid; ip->i_gid = (gid_t)vip->vii_gid; - ip->i_nlink = vip->vii_nlink; + set_nlink(ip, vip->vii_nlink); ip->i_size = vip->vii_size; ip->i_atime.tv_sec = vip->vii_atime; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 85542a7daf40..42593c587d48 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (iop) inode->i_op = iop; inode->i_fop = fop; - inode->i_nlink = nlink; + set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); return dentry; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index add96f6ffda5..3e6d72756479 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -151,7 +151,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = attr->ino; inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); - inode->i_nlink = attr->nlink; + set_nlink(inode, attr->nlink); inode->i_uid = attr->uid; inode->i_gid = attr->gid; inode->i_blocks = attr->blocks; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78418b4fa857..1656df7aacd2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -299,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink) if (nlink == 0) clear_nlink(inode); else - inode->i_nlink = nlink; + set_nlink(inode, nlink); } } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 96a1b625fc74..a1a9fdcd2a00 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; @@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data) /* Initialize the inode */ inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; - inode->i_nlink = 1; + set_nlink(inode, 1); if (idata->key) HFS_I(inode)->cat_key = *idata->key; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4cc1e3a36ec7..40e1413be4cf 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; hip = HFSPLUS_I(inode); @@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); @@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 
&file->rsrc_fork : &file->data_fork); hfsplus_get_perms(inode, &file->permissions, 0); - inode->i_nlink = 1; + set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) - inode->i_nlink = - be32_to_cpu(file->permissions.dev); + set_nlink(inode, + be32_to_cpu(file->permissions.dev)); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0d22afdd4611..2f72da5ae686 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -541,7 +541,7 @@ static int read_name(struct inode *ino, char *name) ino->i_ino = st.ino; ino->i_mode = st.mode; - ino->i_nlink = st.nlink; + set_nlink(ino, st.nlink); ino->i_uid = st.uid; ino->i_gid = st.gid; ino->i_atime = st.atime; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 96a8ed91cedd..2fa0089a02a8 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); } unlock_new_inode(result); } diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index dfe800c8ae38..3b2cec29972b 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; i->i_data.a_ops = &hpfs_symlink_aops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = ea_size; i->i_blocks = 1; brelse(bh); @@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i) } if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { brelse(bh); - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = 0; i->i_blocks = 1; init_special_inode(i, mode, @@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i) hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); i->i_blocks = 4 * n_dnodes; i->i_size = 2048 * n_dnodes; - i->i_nlink = 2 + n_subdirs; + set_nlink(i, 2 + n_subdirs); } else { i->i_mode |= S_IFREG; if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = le32_to_cpu(fnode->file_size); i->i_blocks = ((i->i_size + 511) >> 9) + 1; i->i_data.a_ops = &hpfs_aops; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 2df69e2f07cf..ea91fcb0ef9b 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) result->i_fop = &hpfs_dir_ops; result->i_blocks = 4; result->i_size = 2048; - result->i_nlink = 2; + set_nlink(result, 2); if (dee.read_only) result->i_mode &= ~0222; @@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; @@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = 0; result->i_blocks = 1; init_special_inode(result, mode, rdev); @@ 
-318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); result->i_blocks = 1; - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = strlen(symlink); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 970ea987b3f6..f590b1160c6c 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -702,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_ctime = proc_ino->i_ctime; inode->i_ino = proc_ino->i_ino; inode->i_mode = proc_ino->i_mode; - inode->i_nlink = proc_ino->i_nlink; + set_nlink(inode, proc_ino->i_nlink); inode->i_size = proc_ino->i_size; inode->i_blocks = proc_ino->i_blocks; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..562adabef985 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1319,7 +1319,7 @@ static int isofs_read_inode(struct inode *inode) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_nlink = 1; /* + set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is @@ -1337,7 +1337,7 @@ static int isofs_read_inode(struct inode *inode) */ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } - inode->i_nlink = 1; + set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 1fbc7de88f50..70e79d0c756a 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -363,7 +363,7 @@ repeat: break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); - inode->i_nlink = isonum_733(rr->u.PX.n_links); + set_nlink(inode, isonum_733(rr->u.PX.n_links)); inode->i_uid = isonum_733(rr->u.PX.uid); inode->i_gid = isonum_733(rr->u.PX.gid); break; @@ -496,7 +496,7 @@ repeat: goto out; } inode->i_mode = reloc->i_mode; - inode->i_nlink = reloc->i_nlink; + set_nlink(inode, reloc->i_nlink); inode->i_uid = reloc->i_uid; inode->i_gid = reloc->i_gid; inode->i_rdev = reloc->i_rdev; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 9659b7c00468..be6169bd8acd 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; + set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); @@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) f = JFFS2_INODE_INFO(inode); /* Directories get nlink 2 at start */ - inode->i_nlink = 2; + set_nlink(inode, 2); /* but ic->pino_nlink is the parent ino# */ f->inocache->pino_nlink = dir_i->i_ino; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bbcb9755dd2b..7286e44ac665 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) 
inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); - inode->i_nlink = f->inocache->pino_nlink; + set_nlink(inode, f->inocache->pino_nlink); inode->i_blocks = (inode->i_size + 511) >> 9; @@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFDIR: { struct jffs2_full_dirent *fd; - inode->i_nlink = 2; /* parent and '.' */ + set_nlink(inode, 2); /* parent and '.' */ for (fd=f->dents; fd; fd = fd->next) { if (fd->type == DT_DIR && fd->ino) @@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r iput(inode); return ERR_PTR(ret); } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_ino = je32_to_cpu(ri->ino); inode->i_mode = jemode_to_cpu(ri->mode); inode->i_gid = je16_to_cpu(ri->gid); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index b78b2f978f04..1b6f15f191b3 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); return (NULL); } @@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* copy on-disk inode to in-memory inode */ if ((copy_from_dinode(dp, ip)) != 0) { /* handle bad return by returning NULL for ip */ - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); /* release the page */ release_metapage(mp); @@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_mode |= 0001; } } - ip->i_nlink = le32_to_cpu(dip->di_nlink); + set_nlink(ip, le32_to_cpu(dip->di_nlink)); jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); if (sbi->uid == -1) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 94b0a624b85f..a112ad96e474 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) goto out3; } - ip->i_nlink = 2; /* for '.' */ + set_nlink(ip, 2); /* for '.' 
*/ ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; diff --git a/fs/libfs.c b/fs/libfs.c index a2c0029cd95f..f6d411eef1e7 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); root = d_alloc_root(inode); if (!root) { iput(inode); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index d8d09380c7de..2ac4217b7901 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) inode->i_atime = be64_to_timespec(di->di_atime); inode->i_ctime = be64_to_timespec(di->di_ctime); inode->i_mtime = be64_to_timespec(di->di_mtime); - inode->i_nlink = be32_to_cpu(di->di_refcount); + set_nlink(inode, be32_to_cpu(di->di_refcount)); inode->i_generation = be32_to_cpu(di->di_generation); switch (inode->i_mode & S_IFMT) { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e7d23e25bf1d..64cdcd662ffc 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -446,7 +446,7 @@ static struct inode *V1_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; inode->i_mtime.tv_nsec = 0; @@ -479,7 +479,7 @@ static struct inode *V2_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; inode->i_atime.tv_sec = raw_inode->i_atime; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 202f370526a7..5b5fa33b6b9d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -228,7 +228,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_uid = server->m.uid; inode->i_gid = server->m.gid; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1fc715acc2da..c07a55aec838 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -355,7 +355,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_DATA | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) @@ -1361,7 +1361,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 08ac272b7c28..b50ffb72e5b3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode, 
inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index a3141990061e..768982de10e4 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) nilfs_warning(inode->i_sb, __func__, "deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); if (err) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1371487da955..97e2dacbc867 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -612,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * might be tricky due to vfs interactions. Need to think about this * some more when implementing the unlink command. */ - vi->i_nlink = le16_to_cpu(m->link_count); + set_nlink(vi, le16_to_cpu(m->link_count)); /* * FIXME: Reparse points can have the directory bit set even though * they would be S_IFLNK. Need to deal with this further below when we @@ -634,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi) vi->i_mode &= ~vol->dmask; /* Things break without this kludge! */ if (vi->i_nlink > 1) - vi->i_nlink = 1; + set_nlink(vi, 1); } else { vi->i_mode |= S_IFREG; /* Apply the file permissions mask set in the mount options. */ @@ -1242,7 +1242,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; @@ -1508,7 +1508,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8582e3f4f120..e2878b5895fb 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2292,7 +2292,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, di_bh); i_size_write(inode, size); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); @@ -2354,7 +2354,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7642d7ca73e5..e1ed5e502ff2 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2092,7 +2092,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); inode->i_mode = be16_to_cpu(lvb->lvb_imode); - inode->i_nlink 
= be16_to_cpu(lvb->lvb_inlink); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); ocfs2_unpack_timespec(&inode->i_atime, be64_to_cpu(lvb->lvb_iatime_packed)); ocfs2_unpack_timespec(&inode->i_mtime, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b4c8bb6b8d28..a22d2c098890 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_flags)); @@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode, OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ea0ecbdda58c..a8b2bfea574e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -199,7 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) * these are used by the support functions here and in * callers. */ if (S_ISDIR(mode)) - inode->i_nlink = 2; + set_nlink(inode, 2); inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; @@ -2016,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, 1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, @@ -2114,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, -1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); leave: @@ -2435,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a2a5bff774e3..e4e0ff7962e2 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -242,7 +242,7 @@ found: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); break; case op_inode_prop: if (!strcmp(dp->name, "options") && (len == 17) && @@ -251,7 +251,7 @@ found: else inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = ent_oi->u.prop->length; break; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 8f0087e20e16..851ba3dcdc29 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2248,7 +2248,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = 
p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; /* Use getattr to fix if necessary */ + set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) @@ -2642,7 +2642,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; + set_nlink(inode, 2); if (S_ISLNK(inode->i_mode)) inode->i_size = 64; if (p->iop) @@ -2981,8 +2981,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); @@ -3233,8 +3233,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9d99131d0d65..10090d9c7ad5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct proc_dir_entry *de = PROC_I(inode)->pde; if (de && de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); generic_fillattr(inode, stat); return 0; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7ed72d6c1c6f..7737c5468a40 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -445,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (de->size) inode->i_size = de->size; if (de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); if (de->proc_iops) inode->i_op = de->proc_iops; if (de->proc_fops) { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2b0646613f5a..3bdd21418432 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -379,7 +379,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->di_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); - inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); + set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); inode->i_mtime.tv_nsec = 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c425441c6942..950f13af0951 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path) set_inode_item_key_version(inode, KEY_FORMAT_3_5); set_inode_sd_version(inode, STAT_DATA_V1); inode->i_mode = sd_v1_mode(sd); - inode->i_nlink = sd_v1_nlink(sd); + set_nlink(inode, sd_v1_nlink(sd)); inode->i_uid = sd_v1_uid(sd); inode->i_gid = sd_v1_gid(sd); inode->i_size = sd_v1_size(sd); @@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path) struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); inode->i_mode = sd_v2_mode(sd); - inode->i_nlink = sd_v2_nlink(sd); + set_nlink(inode, sd_v2_nlink(sd)); inode->i_uid = sd_v2_uid(sd); inode->i_size = sd_v2_size(sd); inode->i_gid = 
sd_v2_gid(sd); @@ -1832,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, #endif /* fill stat data */ - inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); /* uid and gid must already be set by the caller for quota init */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 6ce332821633..80058e8ce361 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -19,7 +19,7 @@ #include #include -#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); // directory item contains array of entry headers. This performs @@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_warning(inode->i_sb, "reiserfs-7042", "deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } drop_nlink(inode); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2305e3121cb1..8b4089f30408 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; inode->i_dataoffset = pos + inode->i_metasize; - i->i_nlink = 1; /* Hard to decide.. */ + set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 04bebcaa2373..fd7b3b3bda13 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) frag_offset = 0; } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; @@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; @@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; inode->i_data.a_ops = &squashfs_symlink_aops; @@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode 
*inode, long long ino) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } @@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } diff --git a/fs/stack.c b/fs/stack.c index b4f2ab48a61f..9c11519245a6 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -71,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - dest->i_nlink = src->i_nlink; + set_nlink(dest, src->i_nlink); } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e23f28894a3a..c81b22f3ace1 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -218,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sd->s_dir.subdirs + 2; + set_nlink(inode, sd->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 0630eb969a28..25ffb3e9a3f8 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); - inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); + set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b28121278d46..20403dc5d437 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) goto out_ino; inode->i_flags |= (S_NOCMTIME | S_NOATIME); - inode->i_nlink = le32_to_cpu(ino->nlink); + set_nlink(inode, le32_to_cpu(ino->nlink)); inode->i_uid = le32_to_cpu(ino->uid); inode->i_gid = le32_to_cpu(ino->gid); inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 993adc8f1c87..bf18f7a04544 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -561,7 +561,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) clear_nlink(inode); 
err = remove_xattr(c, host, inode, &nm); if (err) - inode->i_nlink = 1; + set_nlink(inode, 1); /* If @i_nlink is 0, 'iput()' will delete the inode */ iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358ed80c1..6e73f1d6e93c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1236,6 +1236,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) int offset; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct udf_inode_info *iinfo = UDF_I(inode); + unsigned int link_count; fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; @@ -1318,9 +1319,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_mode &= ~sbi->s_umask; read_unlock(&sbi->s_cred_lock); - inode->i_nlink = le16_to_cpu(fe->fileLinkCount); - if (!inode->i_nlink) - inode->i_nlink = 1; + link_count = le16_to_cpu(fe->fileLinkCount); + if (!link_count) + link_count = 1; + set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e8d61b114c1b..f1c64c6a4532 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -667,7 +667,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) iput(inode); goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = @@ -837,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { udf_debug("Deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = udf_delete_entry(dir, fi, &fibh, &cfi); if (retval) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index b4d791a83207..879b13436fa4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) * Copy data to the in-core inode. */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; @@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) * Copy data to the in-core inode. 
*/ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 9ba2a07b7343..23ce927973a4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1153,7 +1153,7 @@ xfs_setup_inode( hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; + set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a049fd2aa4c..48c1f5fc7411 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1754,6 +1754,19 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +/** + * set_nlink - directly set an inode's link count + * @inode: inode + * @nlink: new nlink (should be non-zero) + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. + */ +static inline void set_nlink(struct inode *inode, unsigned int nlink) +{ + inode->i_nlink = nlink; +} + /** * inc_nlink - directly increment an inode's link count * @inode: inode -- cgit v1.2.3-58-ga151 From a78ef704a8dd430225955f0709b22d4a6ba21deb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:30 +0200 Subject: vfs: protect i_nlink Prevent direct modification of i_nlink by making it const and adding a non-const __i_nlink alias. Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- fs/inode.c | 2 +- include/linux/fs.h | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index ecbb68dc7e2a..ee4e66b998f4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -142,7 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &empty_fops; - inode->i_nlink = 1; + inode->__i_nlink = 1; inode->i_opflags = 0; inode->i_uid = 0; inode->i_gid = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 48c1f5fc7411..23467d768cab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -768,7 +768,17 @@ struct inode { /* Stat data, not accessed from path walking */ unsigned long i_ino; - unsigned int i_nlink; + /* + * Filesystems may only read i_nlink directly. 
They shall use the + * following functions for modification: + * + * (set|clear|inc|drop)_nlink + * inode_(inc|dec)_link_count + */ + union { + const unsigned int i_nlink; + unsigned int __i_nlink; + }; dev_t i_rdev; loff_t i_size; struct timespec i_atime; @@ -1764,7 +1774,7 @@ static inline void mark_inode_dirty_sync(struct inode *inode) */ static inline void set_nlink(struct inode *inode, unsigned int nlink) { - inode->i_nlink = nlink; + inode->__i_nlink = nlink; } /** @@ -1777,7 +1787,7 @@ static inline void set_nlink(struct inode *inode, unsigned int nlink) */ static inline void inc_nlink(struct inode *inode) { - inode->i_nlink++; + inode->__i_nlink++; } static inline void inode_inc_link_count(struct inode *inode) @@ -1799,7 +1809,7 @@ static inline void inode_inc_link_count(struct inode *inode) */ static inline void drop_nlink(struct inode *inode) { - inode->i_nlink--; + inode->__i_nlink--; } /** @@ -1812,7 +1822,7 @@ static inline void drop_nlink(struct inode *inode) */ static inline void clear_nlink(struct inode *inode) { - inode->i_nlink = 0; + inode->__i_nlink = 0; } static inline void inode_dec_link_count(struct inode *inode) -- cgit v1.2.3-58-ga151 From f0023bc617ba600956b9226f1806033d7486c8ba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 28 Oct 2011 10:02:42 -0700 Subject: vfs: add d_prune dentry operation This adds a d_prune dentry operation that is called by the VFS prior to pruning (i.e. unhashing and killing) a hashed dentry from the dcache. Wrap dentry_lru_del() and use the new _prune() helper in the cases where we are about to unhash and kill the dentry. This will be used by Ceph to maintain a flag indicating whether the complete contents of a directory are contained in the dcache, allowing it to satisfy lookups and readdir without additional server communication. Renumber a few DCACHE_* #defines to group DCACHE_OP_PRUNE with the other DCACHE_OP_ bits. Signed-off-by: Sage Weil Signed-off-by: Christoph Hellwig --- Documentation/filesystems/Locking | 1 + fs/dcache.c | 40 ++++++++++++++++++++++++++++++++++----- include/linux/dcache.h | 8 +++++--- 3 files changed, 41 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 653380793a6c..d819ba16a0c7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -29,6 +29,7 @@ d_hash no no no maybe d_compare: yes no no maybe d_delete: no yes no no d_release: no no yes no +d_prune: no yes no no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no diff --git a/fs/dcache.c b/fs/dcache.c index a88948b8bd17..274f13e2f094 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -225,7 +225,7 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|move_tail) must be called with d_lock held. + * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { @@ -245,6 +245,9 @@ static void __dentry_lru_del(struct dentry *dentry) dentry_stat.nr_unused--; } +/* + * Remove a dentry with references from the LRU. + */ static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { @@ -254,6 +257,23 @@ static void dentry_lru_del(struct dentry *dentry) } } +/* + * Remove a dentry that is unreferenced and about to be pruned + * (unhashed and destroyed) from the LRU, and inform the file system. + * This wrapper should be called _prior_ to unhashing a victim dentry.
+ */ +static void dentry_lru_prune(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + static void dentry_lru_move_tail(struct dentry *dentry) { spin_lock(&dcache_lru_lock); @@ -403,8 +423,12 @@ relock: if (ref) dentry->d_count--; - /* if dentry was on the d_lru list delete it from there */ - dentry_lru_del(dentry); + /* + * if dentry was on the d_lru list delete it from there. + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -854,8 +878,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) do { struct inode *inode; - /* detach from the system */ - dentry_lru_del(dentry); + /* + * remove the dentry from the lru, and inform + * the fs that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); __d_shrink(dentry); if (dentry->d_count != 0) { @@ -1283,6 +1311,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; } EXPORT_SYMBOL(d_set_d_op); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 62157c03caf7..4df926199369 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -165,6 +165,7 @@ struct dentry_operations { unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); + void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); @@ -184,8 +185,9 @@ struct dentry_operations { #define DCACHE_OP_COMPARE 0x0002 #define DCACHE_OP_REVALIDATE 0x0004 #define DCACHE_OP_DELETE 0x0008 +#define DCACHE_OP_PRUNE 0x0010 -#define DCACHE_DISCONNECTED 0x0010 +#define DCACHE_DISCONNECTED 0x0020 /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first @@ -196,8 +198,8 @@ struct dentry_operations { * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ -#define DCACHE_REFERENCED 0x0020 /* Recently used, don't discard. */ -#define DCACHE_RCUACCESS 0x0040 /* Entry has ever been RCU-visible */ +#define DCACHE_REFERENCED 0x0040 /* Recently used, don't discard. 
*/ +#define DCACHE_RCUACCESS 0x0080 /* Entry has ever been RCU-visible */ #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 -- cgit v1.2.3-58-ga151 From 548fd1e8dba90bea674f5969d73498959d83924b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:08:59 +0100 Subject: um: kill useless include of user.h everything in USER_OBJ gets it via -include user.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_user.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index d51a98384bc0..dd7bc38a3825 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -16,7 +16,6 @@ #include #include "hostfs.h" #include "os.h" -#include "user.h" #include static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) -- cgit v1.2.3-58-ga151 From 7657cacf478940b995c2c73fdff981c13cc62c5c Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sat, 22 Oct 2011 01:34:48 +0100 Subject: Squashfs: Add an option to set dev block size to 4K This commit adds an option to set the device block size to 4K. By default Squashfs sets the device block size (sb_min_blocksize) to 1K or the smallest block size supported by the block device (if larger). This, because blocks are packed together and unaligned in Squashfs, should reduce latency. This, however, gives poor performance on MTD NAND devices where the optimal I/O size is 4K (even though the devices can support smaller block sizes). Using a 4K device block size may also improve overall I/O performance for some file access patterns (e.g. sequential accesses of files in filesystem order) on all media. Signed-off-by: Phillip Lougher --- fs/squashfs/Kconfig | 22 ++++++++++++++++++++++ fs/squashfs/squashfs_fs.h | 7 +++++++ fs/squashfs/super.c | 2 +- 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 1360d4f88f41..5586011b6a38 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -78,6 +78,28 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_4K_DEVBLK_SIZE + bool "Use 4K device block size?" + depends on SQUASHFS + help + By default Squashfs sets the dev block size (sb_min_blocksize) + to 1K or the smallest block size supported by the block device + (if larger). This, because blocks are packed together and + unaligned in Squashfs, should reduce latency. + + This, however, gives poor performance on MTD NAND devices where + the optimal I/O size is 4K (even though the devices can support + smaller block sizes). + + Using a 4K device block size may also improve overall I/O + performance for some file access patterns (e.g. sequential + accesses of files in filesystem order) on all media. + + Setting this option will force Squashfs to use a 4K device block + size by default. + + If unsure, say N.
+ config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b4a4e539a08c..e8e14645de9a 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -36,6 +36,13 @@ #define SQUASHFS_FILE_SIZE 131072 #define SQUASHFS_FILE_LOG 17 +/* default size of block device I/O */ +#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE +#define SQUASHFS_DEVBLK_SIZE 4096 +#else +#define SQUASHFS_DEVBLK_SIZE 1024 +#endif + #define SQUASHFS_FILE_MAX_SIZE 1048576 #define SQUASHFS_FILE_MAX_LOG 20 diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 7438850c62d0..2da1715452ac 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->read_data_mutex); -- cgit v1.2.3-58-ga151 From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 2 Nov 2011 13:37:41 -0700 Subject: binfmt_elf: fix PIE execution with randomization disabled The case of address space randomization being disabled at runtime through the randomize_va_space sysctl is not treated properly in load_elf_binary(), resulting in SIGKILL coming at exec() time for certain PIE-linked binaries in case the randomization has been disabled at runtime prior to calling exec(). Handle the randomize_va_space == 0 case the same way as if we were not supporting .text randomization at all. Based on original patch by H.J. Lu and Josh Boyer. Signed-off-by: Jiri Kosina Cc: Ingo Molnar Cc: Russell King Cc: H.J. Lu Cc: Tested-by: Josh Boyer Acked-by: Nicolas Pitre Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc56d38..21ac5ee4b43f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif -- cgit v1.2.3-58-ga151 From 0620d9193cb976ba635d56a6cfd11cb81616d02b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 2 Nov 2011 13:37:45 -0700 Subject: ramfs: remove module leftovers Since ramfs is hard-selected to "y", the module leftovers make no sense. Signed-off-by: Richard Weinberger Reviewed-by: WANG Cong Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eacb166fb259..462ceb38fec6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -23,7 +23,6 @@ * caches is sufficient.
*/ -#include #include #include @@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } - -static void __exit exit_ramfs_fs(void) -{ - unregister_filesystem(&ramfs_fs_type); -} - module_init(init_ramfs_fs) -module_exit(exit_ramfs_fs) int __init init_rootfs(void) { @@ -311,5 +303,3 @@ int __init init_rootfs(void) return err; } - -MODULE_LICENSE("GPL") -- cgit v1.2.3-58-ga151 From 3069083cc8def2ffad8520f0f24c6f95f140aac5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 2 Nov 2011 13:38:00 -0700 Subject: isofs: add readpages support Use mpage_readpages() instead of multiple calls to isofs_readpage() to reduce CPU utilization and improve performance. Signed-off-by: Namjae Jeon Cc: Al Viro Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..46844ff39d61 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "isofs.h" #include "zisofs.h" @@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block) static int isofs_readpage(struct file *file, struct page *page) { - return block_read_full_page(page,isofs_get_block); + return mpage_readpage(page, isofs_get_block); } + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, + .readpages = isofs_readpages, .bmap = _isofs_bmap }; -- cgit v1.2.3-58-ga151 From 434a964daa14b9db083ce20404a4a2add54d037a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 2 Nov 2011 13:38:01 -0700 Subject: hfs: fix hfs_find_init() sb->ext_tree NULL ptr oops Clement Lecigne reports a filesystem which causes a kernel oops in hfs_find_init() trying to dereference sb->ext_tree, which is NULL. This proves to be because the filesystem has a corrupted MDB extent record, where the extents file does not fit into the first three extents in the file record (the first blocks). In hfs_get_block(), when looking up the blocks for the extent file (HFS_EXT_CNID), it fails the first-blocks special case and falls through to the extent code (which ultimately calls hfs_find_init()) while that code is still in the process of being initialised. HFS avoids this scenario by ensuring the extents B-tree always fits into the first blocks (the extents B-tree can't have overflow extents). The fix is to check at mount time that the B-tree fits into the first blocks, i.e. fail if HFS_I(inode)->alloc_blocks >= HFS_I(inode)->first_blocks Note, the existing commit 47f365eb57573 ("hfs: fix oops on mount with corrupted btree extent records") becomes subsumed into this as a special case, but only for the extents B-tree (HFS_EXT_CNID); it is perfectly acceptable for the catalog B-tree file to grow beyond three extents, with the remaining extent descriptors in the extents overflow.
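To make the mount-time invariant concrete, here is a small worked illustration (the numbers are invented for illustration and are not from the report):

unsigned int first_blocks = 3;	/* blocks reachable from the three extents stored directly in the MDB record */
unsigned int alloc_blocks = 7;	/* blocks the corrupted record claims the extents file occupies */

/* alloc_blocks > first_blocks: blocks 3..6 could only be located by
 * consulting the extents B-tree, but opening that very tree is what is
 * in progress -- sb->ext_tree is still NULL, so the lookup would oops.
 * The new check rejects such a filesystem in hfs_btree_open() before
 * any such lookup can happen. */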
This fixes CVE-2011-2203 Reported-by: Clement LECIGNE Signed-off-by: Phillip Lougher Cc: Jeff Mahoney Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 3ebc437736fe..1cbdeea1db44 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: @@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); - if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); - goto free_inode; - } - mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) -- cgit v1.2.3-58-ga151 From 887df07891de0435c25cffb92268fea2c621f99c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 2 Nov 2011 13:38:42 -0700 Subject: procfs: report EISDIR when reading sysctl dirs in proc On reading sysctl dirs we should return -EISDIR instead of -EINVAL. Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbef226f..dacd840a675a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -370,6 +370,7 @@ static const struct file_operations proc_sys_file_operations = { }; static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, .readdir = proc_sys_readdir, .llseek = generic_file_llseek, }; -- cgit v1.2.3-58-ga151 From aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Wed, 2 Nov 2011 13:38:44 -0700 Subject: proc: fix races against execve() of /proc/PID/fd** fd* files are restricted to the task's owner, and other users may not get direct access to them. But one may open any of these files and then run a setuid program, keeping the file descriptors open. As there are permission checks on open(), but not on readdir() and read(), operations on the kept file descriptors will not be checked. This makes it possible to violate the procfs permission model. Reading fdinfo/* may disclose the current fds' positions and flags, and reading the directory contents of fdinfo/ and fd/ may disclose the number of files opened by the target task. This information is not sensitive per se, but it can reveal some private information (like the length of a password stored in a file) under certain conditions.
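A minimal userspace sketch of the kept-descriptor scenario (purely hypothetical illustration, not part of the patch; the fdinfo path and the setuid target are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char path[64], buf[256];
	pid_t pid = fork();

	if (pid == 0) {
		/* child: turns itself into a privileged task */
		execl("/usr/bin/passwd", "passwd", (char *)NULL);
		_exit(1);
	}

	/* parent: race the execve() -- open the child's fdinfo while the
	 * child is still unprivileged, so the open() permission check passes */
	snprintf(path, sizeof(path), "/proc/%d/fdinfo/0", (int)pid);
	int fd = open(path, O_RDONLY);

	/* once the child runs setuid, a fresh open() would be refused, but
	 * read() on the kept descriptor used to keep working, leaking the
	 * privileged task's file position and flags */
	ssize_t n = (fd >= 0) ? read(fd, buf, sizeof(buf) - 1) : -1;
	if (n > 0)
		fwrite(buf, 1, n, stdout);
	return 0;
}

With this patch the read() (like stat(), readdir() and lookup() on fd/ and fdinfo/) fails with EACCES once ptrace_may_access() no longer lets the caller trace the target.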
Used existing (un)lock_trace functions to check for ptrace_may_access(), but instead of using EPERM return code from it use EACCES to be consistent with existing proc_pid_follow_link()/proc_pid_readlink() return code. If they differ, attacker can guess what fds exist by analyzing stat() return code. Patched handlers: stat() for fd/*, stat() and read() for fdindo/*, readdir() and lookup() for fd/ and fdinfo/. Signed-off-by: Vasiliy Kulikov Cc: Cyrill Gorcunov Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 146 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8f0087e20e16..d4f4913f00db 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,12 +1652,46 @@ out: return error; } +static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + int rc; + + if (task == NULL) + return -ESRCH; + + rc = -EACCES; + if (lock_trace(task)) + goto out_task; + + generic_fillattr(inode, stat); + unlock_trace(task); + rc = 0; +out_task: + put_task_struct(task); + return rc; +} + static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; +static const struct inode_operations proc_fdinfo_link_inode_operations = { + .setattr = proc_setattr, + .getattr = proc_pid_fd_link_getattr, +}; + +static const struct inode_operations proc_fd_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, + .getattr = proc_pid_fd_link_getattr, +}; + /* building an inode */ @@ -1889,49 +1923,61 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; + struct task_struct *task; + struct files_struct *files; struct file *file; int fd = proc_fd(inode); + int rc; - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; + task = get_proc_task(inode); + if (!task) + return -ENOENT; + + rc = -EACCES; + if (lock_trace(task)) + goto out_task; + + rc = -ENOENT; + files = get_files_struct(task); + if (files == NULL) + goto out_unlock; + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + rc = 0; + } else + rc = -ENOENT; + spin_unlock(&files->file_lock); + put_files_struct(files); + +out_unlock: + unlock_trace(task); +out_task: + put_task_struct(task); + return rc; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_fd_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out; + result = instantiate(dir, dentry, task, &fd); + unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; + + retval = -EACCES; + if (lock_trace(p)) + goto out; + retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; + goto out_unlock; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; + goto out_unlock; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out; + goto out_unlock; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } + +out_unlock: + unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; + inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ -- cgit v1.2.3-58-ga151 From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Nov 2011 13:39:22 -0700 Subject: sysctl: add support for poll() Adding support for poll() in sysctl fs allows userspace to receive notifications of changes in sysctl entries. This adds an infrastructure to allow files in sysctl fs to be pollable and implements it for hostname and domainname. [akpm@linux-foundation.org: s/declare/define/ for definitions] Signed-off-by: Lucas De Marchi Cc: Greg KH Cc: Kay Sievers Cc: Al Viro Cc: "Eric W.
Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 22 ++++++++++++++++++++++ include/linux/utsname.h | 16 ++++++++++++++++ kernel/sys.c | 2 ++ kernel/utsname_sysctl.c | 23 +++++++++++++++++++++++ 5 files changed, 108 insertions(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dacd840a675a..df594803f45a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include #include @@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned long event = (unsigned long)filp->private_data; + unsigned int ret = DEFAULT_POLLMASK; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + return ret; +} static int proc_sys_fill_cache(struct file *filp, void *dirent, filldir_t filldir, @@ -364,6 +407,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct } static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, .read = proc_sys_read, .write = proc_sys_write, .llseek = default_llseek, diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 9a1ec10fd504..703cfa33a3ca 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -931,6 +931,7 @@ enum #ifdef __KERNEL__ #include #include +#include /* For the /proc/sys support */ struct ctl_table; @@ -1011,6 +1012,26 @@ extern int proc_do_large_bitmap(struct ctl_table *, int, * cover common cases. 
*/ +/* Support for userspace poll() to watch for changes */ +struct ctl_table_poll { + atomic_t event; + wait_queue_head_t wait; +}; + +static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) +{ + return (void *)(unsigned long)atomic_read(&poll->event); +} + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +#define __CTL_TABLE_POLL_INITIALIZER(name) { \ + .event = ATOMIC_INIT(0), \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } + +#define DEFINE_CTL_TABLE_POLL(name) \ + struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) + /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { @@ -1021,6 +1042,7 @@ struct ctl_table struct ctl_table *child; struct ctl_table *parent; /* Automatically set */ proc_handler *proc_handler; /* Callback for text formatting */ + struct ctl_table_poll *poll; void *extra1; void *extra2; }; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 4e5b0213fdc1..c714ed75eae2 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -37,6 +37,14 @@ struct new_utsname { #include #include +enum uts_proc { + UTS_PROC_OSTYPE, + UTS_PROC_OSRELEASE, + UTS_PROC_VERSION, + UTS_PROC_HOSTNAME, + UTS_PROC_DOMAINNAME, +}; + struct user_namespace; extern struct user_namespace init_user_ns; @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, } #endif +#ifdef CONFIG_PROC_SYSCTL +extern void uts_proc_notify(enum uts_proc proc); +#else +static inline void uts_proc_notify(enum uts_proc proc) +{ +} +#endif + static inline struct new_utsname *utsname(void) { return ¤t->nsproxy->uts_ns->name; diff --git a/kernel/sys.c b/kernel/sys.c index 58459509b14c..d06c091e0345 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } + uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } + uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..3b0d48ebf81d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -13,6 +13,7 @@ #include #include #include +#include static void *get_uts(ctl_table *table, int write) { @@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, uts_table.data = get_uts(table, write); r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); + + if (write) + proc_sys_poll_notify(table->poll); + return r; } #else #define proc_do_uts_string NULL #endif +static DEFINE_CTL_TABLE_POLL(hostname_poll); +static DEFINE_CTL_TABLE_POLL(domainname_poll); + static struct ctl_table uts_kern_table[] = { { .procname = "ostype", @@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &hostname_poll, }, { .procname = "domainname", @@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &domainname_poll, }, {} }; @@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { {} }; +#ifdef CONFIG_PROC_SYSCTL +/* + * Notify userspace about a change in a certain entry of 
uts_kern_table, + * identified by the parameter proc. + */ +void uts_proc_notify(enum uts_proc proc) +{ + struct ctl_table *table = &uts_kern_table[proc]; + + proc_sys_poll_notify(table->poll); +} +#endif + static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); -- cgit v1.2.3-58-ga151 From 080d676de095a14ecba14c0b9a91acb5bbb634df Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 2 Nov 2011 13:40:10 -0700 Subject: aio: allocate kiocbs in batches In testing aio on a fast storage device, I found that the context lock takes up a fair amount of cpu time in the I/O submission path. The reason is that we take it for every I/O submitted (see __aio_get_req). Since we know how many I/Os are passed to io_submit, we can preallocate the kiocbs in batches, reducing the number of times we take and release the lock. In my testing, I was able to reduce the amount of time spent in _raw_spin_lock_irq by .56% (average of 3 runs). The command I used to test this was: aio-stress -O -o 2 -o 3 -r 8 -d 128 -b 32 -i 32 -s 16384 I also tested the patch with various numbers of events passed to io_submit, and I ran the xfstests aio group of tests to ensure I didn't break anything. Signed-off-by: Jeff Moyer Cc: Daniel Ehrenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 136 +++++++++++++++++++++++++++++++++++++++++----------- include/linux/aio.h | 1 + 2 files changed, 108 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 632b235f4fbe..78c514cfd212 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm) static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - struct aio_ring *ring; - int okay = 0; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) @@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; - /* Check if the completion queue has enough free space to - * accept an event from this io. - */ + return req; +} + +/* + * struct kiocb's are allocated in batches to reduce the number of + * times the ctx lock is acquired and released. + */ +#define KIOCB_BATCH_SIZE 32L +struct kiocb_batch { + struct list_head head; + long count; /* number of requests left to allocate */ +}; + +static void kiocb_batch_init(struct kiocb_batch *batch, long total) +{ + INIT_LIST_HEAD(&batch->head); + batch->count = total; +} + +static void kiocb_batch_free(struct kiocb_batch *batch) +{ + struct kiocb *req, *n; + + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + } +} + +/* + * Allocate a batch of kiocbs. This avoids taking and dropping the + * context lock a lot during setup. 
+ */ +static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) +{ + unsigned short allocated, to_alloc; + long avail; + bool called_fput = false; + struct kiocb *req, *n; + struct aio_ring *ring; + + to_alloc = min(batch->count, KIOCB_BATCH_SIZE); + for (allocated = 0; allocated < to_alloc; allocated++) { + req = __aio_get_req(ctx); + if (!req) + /* allocation failed, go with what we've got */ + break; + list_add(&req->ki_batch, &batch->head); + } + + if (allocated == 0) + goto out; + +retry: spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); - if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + ring = kmap_atomic(ctx->ring_info.ring_pages[0]); + + avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + BUG_ON(avail < 0); + if (avail == 0 && !called_fput) { + /* + * Handle a potential starvation case. It is possible that + * we hold the last reference on a struct file, causing us + * to delay the final fput to non-irq context. In this case, + * ctx->reqs_active is artificially high. Calling the fput + * routine here may free up a slot in the event completion + * ring, allowing this allocation to succeed. + */ + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); + aio_fput_routine(NULL); + called_fput = true; + goto retry; + } + + if (avail < allocated) { + /* Trim back the number of requests. */ + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + if (--allocated <= avail) + break; + } + } + + batch->count -= allocated; + list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); ctx->reqs_active++; - okay = 1; } - kunmap_atomic(ring, KM_USER0); - spin_unlock_irq(&ctx->ctx_lock); - if (!okay) { - kmem_cache_free(kiocb_cachep, req); - req = NULL; - } + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); - return req; +out: + return allocated; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx, + struct kiocb_batch *batch) { struct kiocb *req; - /* Handle a potential starvation case -- should be exceedingly rare as - * requests will be stuck on fput_head only if the aio_fput_routine is - * delayed and the requests were the last user of the struct file. 
- */ - req = __aio_get_req(ctx); - if (unlikely(NULL == req)) { - aio_fput_routine(NULL); - req = __aio_get_req(ctx); - } + + if (list_empty(&batch->head)) + if (kiocb_batch_refill(ctx, batch) == 0) + return NULL; + req = list_first_entry(&batch->head, struct kiocb, ki_batch); + list_del(&req->ki_batch); return req; } @@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + struct iocb *iocb, struct kiocb_batch *batch, + bool compat) { struct kiocb *req; struct file *file; @@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!file)) return -EBADF; - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx, batch); /* returns with 2 references to req */ if (unlikely(!req)) { fput(file); return -EAGAIN; @@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, { struct kioctx *ctx; long ret = 0; - int i; + int i = 0; struct blk_plug plug; + struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } + kiocb_batch_init(&batch, nr); + blk_start_plug(&plug); /* @@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); if (ret) break; } blk_finish_plug(&plug); + kiocb_batch_free(&batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index 2dcb72bff4b6..2314ad8b3c9c 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -117,6 +117,7 @@ struct kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ + struct list_head ki_batch; /* batch allocation */ /* * If the aio_resfd field of the userspace iocb is not zero, -- cgit v1.2.3-58-ga151 From 6f276e49fd108362be3fd67154aaaacf872ea026 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 1 Nov 2011 12:16:15 +0600 Subject: nfs: Fix unused variable warning from file.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following unused variable warning. 
fs/nfs/file.c: In function ‘nfs_file_release’: fs/nfs/file.c:140:17: warning: unused variable ‘dentry’ fs/nfs/file.c: In function ‘nfs_file_read’: fs/nfs/file.c:237:9: warning: unused variable ‘count’ Signed-off-by: Rakib Mullick Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f3cda3..bd7dff001106 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { - struct dentry *dentry = filp->f_path.dentry; - dprintk("NFS: release(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); @@ -234,14 +232,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; - size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { -- cgit v1.2.3-58-ga151 From 2b72c9ccd22c4a3299e5a358dcd639fb253730f4 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 1 Nov 2011 12:23:42 +0600 Subject: nfs: Remove unused variable from write.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_NFS=y and CONFIG_NFS_V3_{,V4}=n we get the following warning. fs/nfs/write.c: In function ‘nfs_writeback_done’: fs/nfs/write.c:1246:21: warning: unused variable ‘server’ Remove the variable 'server' to fix the above warning. Signed-off-by: Rakib Mullick Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b2..b016b8a36399 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1243,7 +1243,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1277,7 +1276,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - server->nfs_client->cl_hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } -- cgit v1.2.3-58-ga151 From 4cdc685c7d06f659ef6c336d4242005cdd8df401 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:45:06 -0700 Subject: pnfs-obj: Remove redundant EOF from objlayout_io_state The EOF calculation was done on .read_pagelist(), cached in objlayout_io_state->eof, and set in objlayout_read_done() into nfs_read_data->res.eof. So set it directly into nfs_read_data->res.eof and avoid the extra member. 
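The eof computation that replaces the cached member is easy to restate standalone; the following is a sketch with invented values, not the kernel function itself:

/* Mirrors the post-patch logic of objlayout_read_pagelist(): trim the
 * request to i_size and derive res.eof from the trimmed span. */
static int read_hits_eof(long long offset, unsigned long *count, long long eof)
{
	if (offset >= eof) {			/* nothing left to read */
		*count = 0;
		return 1;
	}
	if (offset + (long long)*count > eof)
		*count = eof - offset;		/* shorten the final read */
	return offset + (long long)*count >= eof;
}

For a 10000-byte file, read_hits_eof(8192, &count, 10000) with count = 4096 trims count to 1808 and returns 1, which is exactly the value now stored straight into rdata->res.eof.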
Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objlayout.c | 16 +++++++--------- fs/nfs/objlayout/objlayout.h | 1 - 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e2adea..1300736e0fb4 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -287,17 +287,14 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) { - int eof = state->eof; - struct nfs_read_data *rdata; + struct nfs_read_data *rdata = state->rpcdata; state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); - rdata = state->rpcdata; + dprintk("%s: Begin status=%zd eof=%d\n", __func__, + status, rdata->res.eof); rdata->task.tk_status = status; - if (status >= 0) { + if (status >= 0) rdata->res.count = status; - rdata->res.eof = eof; - } objlayout_iodone(state); /* must not use state after this point */ @@ -330,11 +327,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) status = 0; rdata->res.count = 0; rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } + rdata->res.eof = (offset + count) >= eof; + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, rdata->args.pages, rdata->args.pgbase, offset, count, @@ -345,8 +345,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) goto out; } - state->eof = state->offset + state->count >= eof; - status = objio_read_pagelist(state); out: dprintk("%s: Return status %Zd\n", __func__, status); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8e042d..ffb884c6fef0 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -86,7 +86,6 @@ struct objlayout_io_state { void *rpcdata; int status; /* res */ - int eof; /* res */ int committed; /* res */ /* Error reporting (layout_return) */ -- cgit v1.2.3-58-ga151 From e6c40fe3f4c4967f1cb486191ed4a5d5f55f3f7e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:45:46 -0700 Subject: pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist The objlayout driver was always returning PNFS_ATTEMPTED from its read/write_pagelist operations, even on error. Fix that. Start by establishing an error return API from the io-engine, by not returning ssize_t (length-or-error) but returning "int": 0 = OK, < 0 = error. And clean up all return types in the io-engine. Then, if the io-engine returns an error, return PNFS_NOT_ATTEMPTED to the generic layer.
(With a dprint) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 32 ++++++++++++++++---------------- fs/nfs/objlayout/objlayout.c | 36 +++++++++++++++++++----------------- fs/nfs/objlayout/objlayout.h | 4 ++-- 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d0cda12fddc3..0c7c9ec24e67 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -142,7 +142,7 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) } struct objio_state; -typedef ssize_t (*objio_done_fn)(struct objio_state *ios); +typedef int (*objio_done_fn)(struct objio_state *ios); struct objio_state { /* Generic layer */ @@ -720,7 +720,7 @@ out: return 0; } -static ssize_t _sync_done(struct objio_state *ios) +static int _sync_done(struct objio_state *ios) { struct completion *waiting = ios->private; @@ -742,10 +742,10 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static ssize_t _io_exec(struct objio_state *ios) +static int _io_exec(struct objio_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); - ssize_t status = 0; /* sync status */ + int ret = 0; unsigned i; objio_done_fn saved_done_fn = ios->done; bool sync = ios->ol_state.sync; @@ -771,16 +771,16 @@ static ssize_t _io_exec(struct objio_state *ios) if (sync) { wait_for_completion(&wait); - status = saved_done_fn(ios); + ret = saved_done_fn(ios); } - return status; + return ret; } /* * read */ -static ssize_t _read_done(struct objio_state *ios) +static int _read_done(struct objio_state *ios) { ssize_t status; int ret = _io_check(ios, false); @@ -793,7 +793,7 @@ static ssize_t _read_done(struct objio_state *ios) status = ret; objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + return ret; } static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) @@ -833,7 +833,7 @@ err: return ret; } -static ssize_t _read_exec(struct objio_state *ios) +static int _read_exec(struct objio_state *ios) { unsigned i; int ret; @@ -847,14 +847,14 @@ static ssize_t _read_exec(struct objio_state *ios) } ios->done = _read_done; - return _io_exec(ios); /* In sync mode exec returns the io status */ + return _io_exec(ios); err: _io_free(ios); return ret; } -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct objlayout_io_state *ol_state) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); @@ -870,7 +870,7 @@ ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) /* * write */ -static ssize_t _write_done(struct objio_state *ios) +static int _write_done(struct objio_state *ios) { ssize_t status; int ret = _io_check(ios, true); @@ -887,7 +887,7 @@ static ssize_t _write_done(struct objio_state *ios) } objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + return ret; } static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) @@ -955,7 +955,7 @@ err: return ret; } -static ssize_t _write_exec(struct objio_state *ios) +static int _write_exec(struct objio_state *ios) { unsigned i; int ret; @@ -969,14 +969,14 @@ static ssize_t _write_exec(struct objio_state *ios) } ios->done = _write_done; - return _io_exec(ios); /* In sync mode exec returns the io->status */ + return _io_exec(ios); err: _io_free(ios); return ret; } -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +int objio_write_pagelist(struct objlayout_io_state 
*ol_state, bool stable) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1300736e0fb4..99c807df11dc 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -315,16 +315,13 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) loff_t offset = rdata->args.offset; size_t count = rdata->args.count; struct objlayout_io_state *state; - ssize_t status = 0; + int err; loff_t eof; - dprintk("%s: Begin inode %p offset %llu count %d\n", - __func__, rdata->inode, offset, (int)count); - eof = i_size_read(rdata->inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { - status = 0; + err = 0; rdata->res.count = 0; rdata->res.eof = 1; /*FIXME: do we need to call pnfs_ld_read_done() */ @@ -341,14 +338,19 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) rdata->lseg, rdata, GFP_KERNEL); if (unlikely(!state)) { - status = -ENOMEM; + err = -ENOMEM; goto out; } + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - status = objio_read_pagelist(state); + err = objio_read_pagelist(state); out: - dprintk("%s: Return status %Zd\n", __func__, status); - rdata->pnfs_error = status; + if (unlikely(err)) { + rdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -406,10 +408,7 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { struct objlayout_io_state *state; - ssize_t status; - - dprintk("%s: Begin inode %p offset %llu count %u\n", - __func__, wdata->inode, wdata->args.offset, wdata->args.count); + int err; state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, wdata->args.pages, @@ -419,16 +418,19 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, wdata->lseg, wdata, GFP_NOFS); if (unlikely(!state)) { - status = -ENOMEM; + err = -ENOMEM; goto out; } state->sync = how & FLUSH_SYNC; - status = objio_write_pagelist(state, how & FLUSH_STABLE); + err = objio_write_pagelist(state, how & FLUSH_STABLE); out: - dprintk("%s: Return status %Zd\n", __func__, status); - wdata->pnfs_error = status; + if (unlikely(err)) { + wdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index ffb884c6fef0..4edac9b6ac0c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -115,8 +115,8 @@ extern int objio_alloc_io_state( gfp_t gfp_flags); extern void objio_free_io_state(struct objlayout_io_state *state); -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, +extern int objio_read_pagelist(struct objlayout_io_state *ol_state); +extern int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable); /* -- cgit v1.2.3-58-ga151 From 96218556b03d3c6505e2880a097338bf277fd783 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:47:32 -0700 Subject: pnfs-obj: Get rid of objlayout_{alloc,free}_io_state This is part of moving objio_osd to use the ORE. objlayout_io_state had two functions: 1. It was used in the error reporting mechanism at layout_return. This function is kept intact. (A later patch will rename objlayout_io_state => objlayout_io_res.) 2. Carrier of rw io members into the objio_{read,write}_pagelist API.
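To make the two roles concrete, here is a minimal sketch of the split this patch creates. The field names are taken from the hunks below; the stub declarations at the top stand in for the real kernel types, so this is a self-contained illustration rather than the exact kernel definitions:

#include <stdbool.h>

/* Stubs standing in for kernel types -- an assumption of this sketch only. */
struct page;
struct objlayout;
struct pnfs_osd_ioerr;
struct list_head { struct list_head *next, *prev; };
typedef long long loff_t;

/* Function 1 -- kept in the generic objlayout layer: the error-reporting
 * state consumed at layout_return (later renamed objlayout_io_res). */
struct objlayout_io_state {
	struct objlayout *objlay;	/* owning layout */
	void *rpcdata;			/* the NFS rdata/wdata */
	int status;			/* I/O result */
	int committed;			/* write persistence, e.g. NFS_FILE_SYNC */
	struct list_head err_list;	/* tracked on objlay->err_list on error */
	unsigned num_comps;		/* entries in ioerrs[] */
	struct pnfs_osd_ioerr *ioerrs;	/* per-component error records */
};

/* Function 2 -- the rw carrier, now private to the io_engine (objio_osd),
 * which allocates it itself as part of read/write. */
struct objio_state {
	struct objlayout_io_state ol_state; /* embedded generic part */
	struct page **pages;		/* data pages */
	unsigned pgbase;		/* byte offset into the first page */
	unsigned nr_pages;
	unsigned long count;		/* total byte count */
	loff_t offset;			/* file offset */
	bool sync;			/* synchronous I/O? */
	/* ... plus raid-engine bookkeeping (kref, per_dev[], ...) */
};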
The second function (the rw carrier) is removed in this patch. The {r,w}data received from NFS are passed directly to the objio_{read,write}_pagelist API. The io_engine now allocates its own IO state as part of the read/write. The minimal functionality that was part of the generic allocation is passed to the io_engine. So part of this patch is a rename: ios->ol_state.foo => ios->foo. At objlayout_{read,write}_done an objlayout_io_state is passed that denotes the result of the IO (hence the later name change). If the IO is successful, objlayout calls the objio_free_result() API immediately (which for objio_osd causes the release of the io_state). If the IO ended in an error, the state is hung onto until it is reported in layout_return, and is released later through the same objio_free_result() API. (All this is not new, just renamed and cleaned up.) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 94 +++++++++++++++++++++----------- fs/nfs/objlayout/objlayout.c | 124 +++++++++++-------------------------------- fs/nfs/objlayout/objlayout.h | 36 ++++++------- 3 files changed, 112 insertions(+), 142 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 0c7c9ec24e67..48eb91aad554 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -148,6 +148,13 @@ struct objio_state { /* Generic layer */ struct objlayout_io_state ol_state; + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + struct objio_segment *layout; struct kref kref; @@ -394,30 +401,43 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) kfree(objio_seg); } -int objio_alloc_io_state(struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags) +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); struct objio_state *ios; - const unsigned first_size = sizeof(*ios) + - objio_seg->num_comps * sizeof(ios->per_dev[0]); - const unsigned sec_size = objio_seg->num_comps * - sizeof(ios->ol_state.ioerrs[0]); - - ios = kzalloc(first_size + sec_size, gfp_flags); - if (unlikely(!ios)) + struct __alloc_objio_state { + struct objio_state objios; + struct _objio_per_comp per_dev[objio_seg->num_comps]; + struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) return -ENOMEM; - ios->layout = objio_seg; - ios->ol_state.ioerrs = ((void *)ios) + first_size; - ios->ol_state.num_comps = objio_seg->num_comps; + ios = &aos->objios; - *outp = &ios->ol_state; + ios->layout = objio_seg; + objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps, + aos->ioerrs, rpcdata, pnfs_layout_type); + + ios->pages = pages; + ios->pgbase = pgbase; + ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + ios->offset = offset; + ios->count = count; + ios->sync = 0; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + *outp = ios; return 0; } -void objio_free_io_state(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_state *ol_state) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); @@ -598,7 +618,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, if (per_dev->bio == NULL)
{ unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / ios->layout->group_width; if (BIO_MAX_PAGES_KMALLOC < bio_size) @@ -615,11 +635,11 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->ol_state.nr_pages <= pg); + BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; added_len = bio_add_pc_page(q, per_dev->bio, - ios->ol_state.pages[pg], pglen, pgbase); + ios->pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) return -ENOMEM; pgbase = 0; @@ -660,7 +680,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && - (page_off != ios->ol_state.pgbase)); + (page_off != ios->pgbase)); } else { /* dev > si->dev */ per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; @@ -693,8 +713,8 @@ out: static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) { - u64 length = ios->ol_state.count; - u64 offset = ios->ol_state.offset; + u64 length = ios->count; + u64 offset = ios->offset; struct _striping_info si; unsigned last_pg = 0; int ret = 0; @@ -748,7 +768,7 @@ static int _io_exec(struct objio_state *ios) int ret = 0; unsigned i; objio_done_fn saved_done_fn = ios->done; - bool sync = ios->ol_state.sync; + bool sync = ios->sync; if (sync) { ios->done = _sync_done; @@ -792,7 +812,7 @@ static int _read_done(struct objio_state *ios) else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + objlayout_read_done(&ios->ol_state, status, ios->sync); return ret; } @@ -854,12 +874,18 @@ err: return ret; } -int objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios; int ret; + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &ios); + if (unlikely(ret)) + return ret; + ret = _io_rw_pagelist(ios, GFP_KERNEL); if (unlikely(ret)) return ret; @@ -886,7 +912,7 @@ static int _write_done(struct objio_state *ios) status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + objlayout_write_done(&ios->ol_state, status, ios->sync); return ret; } @@ -976,12 +1002,20 @@ err: return ret; } -int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios; int ret; + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &ios); + if (unlikely(ret)) + return ret; + + ios->sync = 0 != (how & FLUSH_SYNC); + /* TODO: ios->stable = stable; */ ret = _io_rw_pagelist(ios, GFP_NOFS); if (unlikely(ret)) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 99c807df11dc..a82053ae5595 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -156,59 +156,23 @@ last_byte_offset(u64 start, u64 len) return end > start ? 
end - 1 : NFS4_MAX_UINT64; } -static struct objlayout_io_state * -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, - struct page **pages, - unsigned pgbase, - loff_t offset, - size_t count, - struct pnfs_layout_segment *lseg, - void *rpcdata, - gfp_t gfp_flags) +void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) { - struct objlayout_io_state *state; u64 lseg_end_offset; - dprintk("%s: allocating io_state\n", __func__); - if (objio_alloc_io_state(lseg, &state, gfp_flags)) - return NULL; - BUG_ON(offset < lseg->pls_range.offset); lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); BUG_ON(offset >= lseg_end_offset); - if (offset + count > lseg_end_offset) { - count = lseg->pls_range.length - - (offset - lseg->pls_range.offset); - dprintk("%s: truncated count %Zd\n", __func__, count); - } + WARN_ON(offset + count > lseg_end_offset); - if (pgbase > PAGE_SIZE) { - pages += pgbase >> PAGE_SHIFT; - pgbase &= ~PAGE_MASK; + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; } - - INIT_LIST_HEAD(&state->err_list); - state->lseg = lseg; - state->rpcdata = rpcdata; - state->pages = pages; - state->pgbase = pgbase; - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - state->offset = offset; - state->count = count; - state->sync = 0; - - return state; -} - -static void -objlayout_free_io_state(struct objlayout_io_state *state) -{ - dprintk("%s: freeing io_state\n", __func__); - if (unlikely(!state)) - return; - - objio_free_io_state(state); } /* @@ -217,12 +181,10 @@ objlayout_free_io_state(struct objlayout_io_state *state) static void objlayout_iodone(struct objlayout_io_state *state) { - dprintk("%s: state %p status\n", __func__, state); - if (likely(state->status >= 0)) { - objlayout_free_io_state(state); + objio_free_result(state); } else { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + struct objlayout *objlay = state->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; @@ -289,15 +251,15 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) { struct nfs_read_data *rdata = state->rpcdata; - state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, - status, rdata->res.eof); - rdata->task.tk_status = status; + state->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; objlayout_iodone(state); /* must not use state after this point */ + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); + if (sync) pnfs_ld_read_done(rdata); else { @@ -314,7 +276,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) { loff_t offset = rdata->args.offset; size_t count = rdata->args.count; - struct objlayout_io_state *state; int err; loff_t eof; @@ -331,20 +292,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) } rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + &rdata->args.pgbase, + rdata->args.offset, rdata->args.count); - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, - rdata->args.pages, rdata->args.pgbase, - offset, count, - rdata->lseg, rdata, - GFP_KERNEL); - if (unlikely(!state)) { - err = -ENOMEM; - goto out; - } dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", __func__, rdata->inode->i_ino, offset, 
count, rdata->res.eof); - err = objio_read_pagelist(state); + err = objio_read_pagelist(rdata); out: if (unlikely(err)) { rdata->pnfs_error = err; @@ -374,23 +329,18 @@ void objlayout_write_done(struct objlayout_io_state *state, ssize_t status, bool sync) { - struct nfs_write_data *wdata; + struct nfs_write_data *wdata = state->rpcdata; - dprintk("%s: Begin\n", __func__); - wdata = state->rpcdata; - state->status = status; - wdata->task.tk_status = status; + state->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; wdata->verf.committed = state->committed; - dprintk("%s: Return status %d committed %d\n", - __func__, wdata->task.tk_status, - wdata->verf.committed); - } else - dprintk("%s: Return status %d\n", - __func__, wdata->task.tk_status); + } objlayout_iodone(state); - /* must not use state after this point */ + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); if (sync) pnfs_ld_write_done(wdata); @@ -407,25 +357,13 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objlayout_io_state *state; int err; - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, - wdata->args.pages, - wdata->args.pgbase, - wdata->args.offset, - wdata->args.count, - wdata->lseg, wdata, - GFP_NOFS); - if (unlikely(!state)) { - err = -ENOMEM; - goto out; - } + _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); - state->sync = how & FLUSH_SYNC; - - err = objio_write_pagelist(state, how & FLUSH_STABLE); - out: + err = objio_write_pagelist(wdata, how); if (unlikely(err)) { wdata->pnfs_error = err; dprintk("%s: Returned Error %d\n", __func__, err); @@ -564,7 +502,7 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } list_del(&state->err_list); - objlayout_free_io_state(state); + objio_free_result(state); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -632,7 +570,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, goto loop_done; } list_del(&state->err_list); - objlayout_free_io_state(state); + objio_free_result(state); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 4edac9b6ac0c..d7b2ccfa2132 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -75,14 +75,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * embedded in objects provider io_state data structure */ struct objlayout_io_state { - struct pnfs_layout_segment *lseg; - - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; - bool sync; + struct objlayout *objlay; void *rpcdata; int status; /* res */ @@ -99,6 +92,18 @@ struct objlayout_io_state { struct pnfs_osd_ioerr *ioerrs; }; +static inline +void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + /* * Raid engine I/O API */ @@ -109,15 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern int objio_alloc_io_state( - struct pnfs_layout_segment *lseg, - 
struct objlayout_io_state **outp, - gfp_t gfp_flags); -extern void objio_free_io_state(struct objlayout_io_state *state); +extern void objio_free_result(struct objlayout_io_state *state); -extern int objio_read_pagelist(struct objlayout_io_state *ol_state); -extern int objio_write_pagelist(struct objlayout_io_state *ol_state, - bool stable); +extern int objio_read_pagelist(struct nfs_read_data *rdata); +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API @@ -127,10 +127,8 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state, int osd_error, u64 offset, u64 length, bool is_write); static inline void -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); - /* If one of the I/Os errored out and the delta_space_used was * invalid we render the complete report as invalid. Protocol mandate * the DSU be accurate or not reported. -- cgit v1.2.3-58-ga151 From e2e04355d9647305c666462a49223f2942a635f0 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:03:35 -0700 Subject: pnfs-obj: Rename objlayout_io_state => objlayout_io_res * All instances of objlayout_io_state => objlayout_io_res * All instances of state => oir; * All instances of ol_state => oir; Big but nothing to it Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 17 ++++++------ fs/nfs/objlayout/objlayout.c | 63 ++++++++++++++++++++++---------------------- fs/nfs/objlayout/objlayout.h | 15 ++++++----- 3 files changed, 48 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 48eb91aad554..2347e0ac63e6 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -146,7 +146,7 @@ typedef int (*objio_done_fn)(struct objio_state *ios); struct objio_state { /* Generic layer */ - struct objlayout_io_state ol_state; + struct objlayout_io_res oir; struct page **pages; unsigned pgbase; @@ -422,7 +422,7 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ios = &aos->objios; ios->layout = objio_seg; - objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps, + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps, aos->ioerrs, rpcdata, pnfs_layout_type); ios->pages = pages; @@ -437,10 +437,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, return 0; } -void objio_free_result(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios = container_of(oir, struct objio_state, oir); kfree(ios); } @@ -519,7 +518,7 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - objlayout_io_set_result(&ios->ol_state, i, + objlayout_io_set_result(&ios->oir, i, &ios->layout->comps[i].oc_object_id, osd_pri_2_pnfs_err(osi.osd_err_pri), ios->per_dev[i].offset, @@ -812,7 +811,7 @@ static int _read_done(struct objio_state *ios) else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->sync); + objlayout_read_done(&ios->oir, status, ios->sync); return ret; } @@ -906,13 +905,13 @@ static int _write_done(struct objio_state *ios) if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - 
ios->ol_state.committed = NFS_FILE_SYNC; + ios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->sync); + objlayout_write_done(&ios->oir, status, ios->sync); return ret; } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index a82053ae5595..72074e3a04f9 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -179,16 +179,16 @@ void _fix_verify_io_params(struct pnfs_layout_segment *lseg, * I/O done common code */ static void -objlayout_iodone(struct objlayout_io_state *state) +objlayout_iodone(struct objlayout_io_res *oir) { - if (likely(state->status >= 0)) { - objio_free_result(state); + if (likely(oir->status >= 0)) { + objio_free_result(oir); } else { - struct objlayout *objlay = state->objlay; + struct objlayout *objlay = oir->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &state->err_list); + list_add(&objlay->err_list, &oir->err_list); spin_unlock(&objlay->lock); } } @@ -200,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) * the error for later reporting at layout-return. */ void -objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - BUG_ON(index >= state->num_comps); + BUG_ON(index >= oir->num_comps); if (osd_error) { ioerr->oer_component = *pooid; ioerr->oer_comp_offset = offset; @@ -247,15 +247,15 @@ static void _rpc_read_complete(struct work_struct *work) } void -objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = state->rpcdata; + struct nfs_read_data *rdata = oir->rpcdata; - state->status = rdata->task.tk_status = status; + oir->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; - objlayout_iodone(state); - /* must not use state after this point */ + objlayout_iodone(oir); + /* must not use oir after this point */ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, status, rdata->res.eof, sync); @@ -326,17 +326,16 @@ static void _rpc_write_complete(struct work_struct *work) } void -objlayout_write_done(struct objlayout_io_state *state, ssize_t status, - bool sync) +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = state->rpcdata; + struct nfs_write_data *wdata = oir->rpcdata; - state->status = wdata->task.tk_status = status; + oir->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; - wdata->verf.committed = state->committed; + wdata->verf.committed = oir->committed; } - objlayout_iodone(state); + objlayout_iodone(oir); /* must not use oir after this point */ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, @@ -475,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, static void encode_accumulated_error(struct objlayout *objlay, __be32 *p) { - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, 
&objlay->err_list, err_list) { unsigned i; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -501,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } - list_del(&state->err_list); - objio_free_result(state); + list_del(&oir->err_list); + objio_free_result(oir); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -514,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, const struct nfs4_layoutreturn_args *args) { struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; __be32 *start; dprintk("%s: Begin\n", __func__); @@ -523,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, spin_lock(&objlay->lock); - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { __be32 *last_xdr = NULL, *p; unsigned i; int res = 0; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -553,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, } last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); } /* TODO: use xdr_write_pages */ @@ -569,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, encode_accumulated_error(objlay, last_xdr); goto loop_done; } - list_del(&state->err_list); - objio_free_result(state); + list_del(&oir->err_list); + objio_free_result(oir); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index d7b2ccfa2132..8ec34727ed21 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -74,7 +74,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * per-I/O operation state * embedded in objects provider io_state data structure */ -struct objlayout_io_state { +struct objlayout_io_res { struct objlayout *objlay; void *rpcdata; @@ -93,7 +93,7 @@ struct objlayout_io_state { }; static inline -void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps, +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, struct pnfs_osd_ioerr *ioerrs, void *rpcdata, struct pnfs_layout_hdr *pnfs_layout_type) { @@ -114,7 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern void objio_free_result(struct objlayout_io_state *state); +/* objio_free_result will free these @oir structs recieved from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); extern int objio_read_pagelist(struct nfs_read_data *rdata); extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); @@ -122,7 +125,7 @@ extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API */ -extern void objlayout_io_set_result(struct objlayout_io_state *state, +extern void objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); @@ -141,9 +144,9 @@ 
objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) spin_unlock(&objlay->lock); } -extern void objlayout_read_done(struct objlayout_io_state *state, +extern void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_state *state, +extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, -- cgit v1.2.3-58-ga151 From af4f5b54bcf0379089d01518e818f37258708fb7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:04:19 -0700 Subject: pnfs-obj: move to ore 01: ore_layout & ore_components For ease of reviewing I split the move to ore into 3 parts: move to ore 01: ore_layout & ore_components move to ore 02: move to ORE move to ore 03: Remove old raid engine This patch modifies objio_lseg (the layout-segment level) and its devices and components arrays to use the ORE types. Though it will be removed soon, the raid engine is also modified so it actually compiles, and possibly runs, with the new types. So it is the same old raid engine, but with some new ORE types. For ease of reviewing, some of the old code is "#if 0"-ed out but not removed, so the diff command works better. The old code will be removed in the 3rd patch. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 272 ++++++++++++++++++++----------------------- 1 file changed, 128 insertions(+), 144 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 2347e0ac63e6..bd7ec26e2840 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,7 +38,7 @@ */ #include -#include +#include #include "objlayout.h" @@ -52,7 +52,7 @@ enum { BIO_MAX_PAGES_KMALLOC = struct objio_dev_ent { struct nfs4_deviceid_node id_node; - struct osd_dev *od; + struct ore_dev od; }; static void @@ -60,8 +60,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) { struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - dprintk("%s: free od=%p\n", __func__, de->od); - osduld_put_device(de->od); + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); kfree(de); } @@ -98,12 +98,12 @@ _dev_list_add(const struct nfs_server *nfss, nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - de->od = od; + de->od.od = od; d = nfs4_insert_deviceid_node(&de->id_node); n = container_of(d, struct objio_dev_ent, id_node); if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); objio_free_deviceid_node(&de->id_node); de = n; } @@ -111,28 +111,11 @@ _dev_list_add(const struct nfs_server *nfss, return de; } -struct caps_buffers { - u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; - u8 creds[OSD_CAP_LEN]; -}; - struct objio_segment { struct pnfs_layout_segment lseg; - struct pnfs_osd_object_cred *comps; - - unsigned mirrors_p1; - unsigned stripe_unit; - unsigned group_width; /* Data stripe_units without integrity comps */ - u64 group_depth; - unsigned group_count; - - unsigned max_io_size; - - unsigned comps_index; - unsigned num_comps; - /* variable length */ - struct objio_dev_ent *ods[]; + struct ore_layout layout; + struct ore_components oc; }; static inline struct objio_segment * @@ -155,7 +138,8 @@ struct objio_state { loff_t offset; bool sync; - struct objio_segment *layout; + struct ore_layout *layout; + struct ore_components *oc; struct kref kref;
objio_done_fn done; @@ -175,32 +159,33 @@ struct objio_state { /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned comp, - gfp_t gfp_flags) +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct nfs4_deviceid *d_id; struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; int err; - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) - return ode; + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); if (unlikely(err)) { dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return ERR_PTR(err); + return err; } odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); err = -EINVAL; goto out; } else if (odi.systemid_len) @@ -225,38 +210,15 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, gfp_flags); - + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); out: - dprintk("%s: return=%d\n", __func__, err); objlayout_put_deviceinfo(deviceaddr); - return err ? 
ERR_PTR(err) : ode; -} - -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, - gfp_t gfp_flags) -{ - unsigned i; - int err; - - /* lookup all devices */ - for (i = 0; i < objio_seg->num_comps; i++) { - struct objio_dev_ent *ode; - - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); - if (unlikely(IS_ERR(ode))) { - err = PTR_ERR(ode); - goto out; - } - objio_seg->ods[i] = ode; - } - err = 0; - -out: - dprintk("%s: return=%d\n", __func__, err); return err; } +#if 0 static int _verify_data_map(struct pnfs_osd_layout *layout) { struct pnfs_osd_data_map *data_map = &layout->olo_map; @@ -296,23 +258,45 @@ static int _verify_data_map(struct pnfs_osd_layout *layout) return 0; } +#endif -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, - struct pnfs_osd_object_cred *src_comp, - struct caps_buffers *caps_p) +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) { - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + struct ore_comp *ocomp = &oc->comps[c]; - *cur_comp = *src_comp; + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, - sizeof(caps_p->caps_key)); - cur_comp->oc_cap_key.cred = caps_p->caps_key; + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - memcpy(caps_p->creds, src_comp->oc_cap.cred, - sizeof(caps_p->creds)); - cur_comp->oc_cap.cred = caps_p->creds; + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); +} + +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) +{ + struct __alloc_objio_segment { + struct objio_segment olseg; + struct ore_dev *ods[numdevs]; + struct ore_comp comps[numdevs]; + } *aolseg; + + aolseg = kzalloc(sizeof(*aolseg), gfp_flags); + if (unlikely(!aolseg)) { + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + numdevs, sizeof(*aolseg)); + return -ENOMEM; + } + + aolseg->olseg.oc.numdevs = numdevs; + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; + aolseg->olseg.oc.comps = aolseg->comps; + aolseg->olseg.oc.ods = aolseg->ods; + + *pseg = &aolseg->olseg; + return 0; } int objio_alloc_lseg(struct pnfs_layout_segment **outp, @@ -324,59 +308,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred *cur_comp, src_comp; - struct caps_buffers *caps_p; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; int err; err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); if (unlikely(err)) return err; - err = _verify_data_map(&layout); + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); if (unlikely(err)) return err; - objio_seg = kzalloc(sizeof(*objio_seg) + - sizeof(objio_seg->ods[0]) * layout.olo_num_comps + - sizeof(*objio_seg->comps) * layout.olo_num_comps + - sizeof(struct caps_buffers) * layout.olo_num_comps, - gfp_flags); - if (!objio_seg) - return -ENOMEM; + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + 
objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); - cur_comp = objio_seg->comps; - caps_p = (void *)(cur_comp + layout.olo_num_comps); - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) - copy_single_comp(cur_comp++, &src_comp, caps_p++); + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); if (unlikely(err)) goto err; - objio_seg->num_comps = layout.olo_num_comps; - objio_seg->comps_index = layout.olo_comps_index; - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); - if (err) - goto err; - - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; - if (layout.olo_map.odm_group_width) { - objio_seg->group_width = layout.olo_map.odm_group_width; - objio_seg->group_depth = layout.olo_map.odm_group_depth; - objio_seg->group_count = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1 / - objio_seg->group_width; - } else { - objio_seg->group_width = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1; - objio_seg->group_depth = -1; - objio_seg->group_count = 1; + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; } - - /* Cache this calculation it will hit for every page */ - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - - objio_seg->stripe_unit) * - objio_seg->group_width; + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; *outp = &objio_seg->lseg; return 0; @@ -393,10 +361,14 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - for (i = 0; i < objio_seg->num_comps; i++) { - if (!objio_seg->ods[i]) + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) break; - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); } kfree(objio_seg); } @@ -411,8 +383,8 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, struct objio_state *ios; struct __alloc_objio_state { struct objio_state objios; - struct _objio_per_comp per_dev[objio_seg->num_comps]; - struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps]; + struct _objio_per_comp per_dev[objio_seg->oc.numdevs]; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; } *aos; aos = kzalloc(sizeof(*aos), gfp_flags); @@ -421,8 +393,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ios = &aos->objios; - ios->layout = objio_seg; - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps, + ios->layout = &objio_seg->layout; + ios->oc = &objio_seg->oc; + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, aos->ioerrs, rpcdata, pnfs_layout_type); ios->pages = pages; @@ -474,6 +447,27 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } +static void __on_dev_error(struct objio_state *ios, bool is_write, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) +{ + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = 
container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; + + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; + + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, is_write); +} + static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -518,12 +512,9 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - objlayout_io_set_result(&ios->oir, i, - &ios->layout->comps[i].oc_object_id, - osd_pri_2_pnfs_err(osi.osd_err_pri), - ios->per_dev[i].offset, - ios->per_dev[i].length, - is_write); + __on_dev_error(ios, is_write, ios->oc->ods[i], + ios->per_dev[i].dev, osi.osd_err_pri, + ios->per_dev[i].offset, ios->per_dev[i].length); if (osi.osd_err_pri >= oep) { oep = osi.osd_err_pri; @@ -558,11 +549,11 @@ static void _io_free(struct objio_state *ios) struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) { - unsigned min_dev = ios->layout->comps_index; - unsigned max_dev = min_dev + ios->layout->num_comps; + unsigned min_dev = ios->oc->first_dev; + unsigned max_dev = min_dev + ios->oc->numdevs; BUG_ON(dev < min_dev || max_dev <= dev); - return ios->layout->ods[dev - min_dev]->od; + return ios->oc->ods[dev - min_dev]->od; } struct _striping_info { @@ -820,12 +811,9 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) struct osd_request *or = NULL; struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; unsigned dev = per_dev->dev; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct ore_comp *cred = + &ios->oc->comps[cur_comp]; + struct osd_obj_id obj = cred->obj; int ret; or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); @@ -837,7 +825,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + ret = osd_finalize_request(or, 0, cred->cred, NULL); if (ret) { dprintk("%s: Faild to osd_finalize_request() => %d\n", __func__, ret); @@ -924,12 +912,8 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct osd_request *or = NULL; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct ore_comp *cred = &ios->oc->comps[cur_comp]; + struct osd_obj_id obj = cred->obj; struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; struct bio *bio; @@ -964,7 +948,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + ret = osd_finalize_request(or, 0, cred->cred, NULL); if (ret) { dprintk("%s: Faild to osd_finalize_request() => %d\n", __func__, ret); @@ -1030,7 +1014,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, return false; return pgio->pg_count + req->wb_bytes <= - OBJIO_LSEG(pgio->pg_lseg)->max_io_size; + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; 
} static const struct nfs_pageio_ops objio_pg_read_ops = { -- cgit v1.2.3-58-ga151 From eecfc6312a24e6d0d2883de0a9a6ccf8e993f472 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:15:38 -0700 Subject: pnfs-obj: move to ore 02: move to ORE In this patch we actually move to the ORE (Object Raid Engine). objio_state holds a pointer to an ore_io_state. Once we have an ore_io_state at hand, we can call the ORE for reading/writing. We register on the done path to kick off the NFS io_done mechanism. Again, for ease of reviewing, the old code is "#if 0"-ed out but not removed, so the diff command works better. The old code will be removed in the next patch. fs/exofs/Kconfig::ORE is modified to also be auto-included if PNFS_OBJLAYOUT is set, since we now depend on ORE. (See comments in fs/exofs/Kconfig.) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/exofs/Kconfig | 2 +- fs/nfs/objlayout/objio_osd.c | 133 +++++++++++++++++++------------------------ 2 files changed, 60 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index fa9a286c8771..da42f32c49be 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -5,7 +5,7 @@ # selected by any of the users. config ORE tristate - depends on EXOFS_FS + depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR default SCSI_OSD_ULD diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index bd7ec26e2840..00b384934c32 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -44,12 +44,6 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -#define _LLU(x) ((unsigned long long)x) - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -}; - struct objio_dev_ent { struct nfs4_deviceid_node id_node; struct ore_dev od; @@ -124,37 +118,13 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } -struct objio_state; -typedef int (*objio_done_fn)(struct objio_state *ios); - struct objio_state { /* Generic layer */ struct objlayout_io_res oir; - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; bool sync; - - struct ore_layout *layout; - struct ore_components *oc; - - struct kref kref; - objio_done_fn done; - void *private; - - unsigned long length; - unsigned numdevs; /* Actually used devs in this IO */ - /* A per-device variable array of size numdevs */ - struct _objio_per_comp { - struct bio *bio; - struct osd_request *or; - unsigned long length; - u64 offset; - unsigned dev; - } per_dev[]; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; }; /* Send and wait for a get_device_info of devices in the layout, @@ -374,16 +344,16 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) } static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct objio_state *ios; + struct ore_io_state *ios; + int ret; struct __alloc_objio_state { struct objio_state objios; - struct _objio_per_comp per_dev[objio_seg->oc.numdevs]; struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; } *aos; @@ -391,30 +361,33 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, if (unlikely(!aos))
return -ENOMEM; - ios = &aos->objios; - - ios->layout = &objio_seg->layout; - ios->oc = &objio_seg->oc; objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, aos->ioerrs, rpcdata, pnfs_layout_type); + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + ios->pages = pages; ios->pgbase = pgbase; - ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - ios->offset = offset; - ios->count = count; - ios->sync = 0; + ios->private = aos; BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - *outp = ios; + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; return 0; } void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(oir, struct objio_state, oir); + struct objio_state *objios = container_of(oir, struct objio_state, oir); - kfree(ios); + ore_put_io_state(objios->ios); + kfree(objios); } enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) @@ -447,7 +420,7 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } -static void __on_dev_error(struct objio_state *ios, bool is_write, +static void __on_dev_error(struct ore_io_state *ios, struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, u64 dev_offset, u64 dev_len) { @@ -465,9 +438,10 @@ static void __on_dev_error(struct objio_state *ios, bool is_write, objlayout_io_set_result(&objios->oir, comp, &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, is_write); + dev_offset, dev_len, !ios->reading); } +#if 0 static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -786,26 +760,28 @@ static int _io_exec(struct objio_state *ios) return ret; } +#endif /* * read */ -static int _read_done(struct objio_state *ios) +static void _read_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, false); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) status = ios->length; else status = ret; - objlayout_read_done(&ios->oir, status, ios->sync); - return ret; + objlayout_read_done(&objios->oir, status, objios->sync); } +#if 0 static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) { struct osd_request *or = NULL; @@ -860,49 +836,50 @@ err: _io_free(ios); return ret; } +#endif int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios; + struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, rdata->lseg, rdata->args.pages, rdata->args.pgbase, rdata->args.offset, rdata->args.count, rdata, - GFP_KERNEL, &ios); - if (unlikely(ret)) - return ret; - - ret = _io_rw_pagelist(ios, GFP_KERNEL); + GFP_KERNEL, &objios); if (unlikely(ret)) return ret; - return _read_exec(ios); + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + return ore_read(objios->ios); } /* * write */ -static int _write_done(struct objio_state *ios) +static void _write_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, true); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) { /* FIXME: 
should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->oir.committed = NFS_FILE_SYNC; + objios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->oir, status, ios->sync); - return ret; + objlayout_write_done(&objios->oir, status, objios->sync); } +#if 0 static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) { struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; @@ -984,27 +961,35 @@ err: _io_free(ios); return ret; } +#endif int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objio_state *ios; + struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, wdata->lseg, wdata->args.pages, wdata->args.pgbase, wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, - &ios); + &objios); if (unlikely(ret)) return ret; - ios->sync = 0 != (how & FLUSH_SYNC); + objios->sync = 0 != (how & FLUSH_SYNC); - /* TODO: ios->stable = stable; */ - ret = _io_rw_pagelist(ios, GFP_NOFS); + if (!objios->sync) + objios->ios->done = _write_done; + + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); if (unlikely(ret)) return ret; - return _write_exec(ios); + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; } static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, -- cgit v1.2.3-58-ga151 From 04291b628c450ab6fdb606836585f16336662a4e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:16:15 -0700 Subject: pnfs-obj: move to ore 03: Remove old raid engine Finally remove all the old raid engine, which is by now dead code. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 504 ------------------------------------------- 1 file changed, 504 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 00b384934c32..3161da654a9b 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -188,48 +188,6 @@ out: return err; } -#if 0 -static int _verify_data_map(struct pnfs_osd_layout *layout) -{ - struct pnfs_osd_data_map *data_map = &layout->olo_map; - u64 stripe_length; - u32 group_width; - -/* FIXME: Only raid0 for now. 
if not go through MDS */ - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { - printk(KERN_ERR "Only RAID_0 for now\n"); - return -ENOTSUPP; - } - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", - data_map->odm_num_comps, data_map->odm_mirror_cnt); - return -EINVAL; - } - - if (data_map->odm_group_width) - group_width = data_map->odm_group_width; - else - group_width = data_map->odm_num_comps / - (data_map->odm_mirror_cnt + 1); - - stripe_length = (u64)data_map->odm_stripe_unit * group_width; - if (stripe_length >= (1ULL << 32)) { - printk(KERN_ERR "Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -ENOTSUPP; - } - - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { - printk(KERN_ERR "Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(data_map->odm_stripe_unit), PAGE_SIZE); - return -ENOTSUPP; - } - - return 0; -} -#endif - static void copy_single_comp(struct ore_components *oc, unsigned c, struct pnfs_osd_object_cred *src_comp) { @@ -441,327 +399,6 @@ static void __on_dev_error(struct ore_io_state *ios, dev_offset, dev_len, !ios->reading); } -#if 0 -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -static int _io_check(struct objio_state *ios, bool is_write) -{ - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; - int lin_ret = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (!or) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - BUG_ON(is_write); - _clear_bio(ios->per_dev[i].bio); - dprintk("%s: start read offset passed end of file " - "offset=0x%llx, length=0x%lx\n", __func__, - _LLU(ios->per_dev[i].offset), - ios->per_dev[i].length); - - continue; /* we recovered */ - } - __on_dev_error(ios, is_write, ios->oc->ods[i], - ios->per_dev[i].dev, osi.osd_err_pri, - ios->per_dev[i].offset, ios->per_dev[i].length); - - if (osi.osd_err_pri >= oep) { - oep = osi.osd_err_pri; - lin_ret = ret; - } - } - - return lin_ret; -} - -/* - * Common IO state helpers. 
- */ -static void _io_free(struct objio_state *ios) -{ - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct _objio_per_comp *per_dev = &ios->per_dev[i]; - - if (per_dev->or) { - osd_end_request(per_dev->or); - per_dev->or = NULL; - } - - if (per_dev->bio) { - bio_put(per_dev->bio); - per_dev->bio = NULL; - } - } -} - -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) -{ - unsigned min_dev = ios->oc->first_dev; - unsigned max_dev = min_dev + ios->oc->numdevs; - - BUG_ON(dev < min_dev || max_dev <= dev); - return ios->oc->ods[dev - min_dev]->od; -} - -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - u32 U = stripe_unit * group_width; - - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodU = file_offset - M * S; - u32 G = div64_u64(LmodU, T); - u64 H = LmodU - G * T; - - u32 N = div_u64(H, U); - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int len, - gfp_t gfp_flags) -{ - unsigned pg = *cur_pg; - int cur_len = len; - struct request_queue *q = - osd_request_queue(_io_od(ios, per_dev->dev)); - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; - - if (BIO_MAX_PAGES_KMALLOC < bio_size) - bio_size = BIO_MAX_PAGES_KMALLOC; - - per_dev->bio = bio_kmalloc(gfp_flags, bio_size); - if (unlikely(!per_dev->bio)) { - dprintk("Faild to allocate BIO size=%u\n", bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, - ios->pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct objio_state *ios, u64 length, - struct _striping_info *si, unsigned *last_pg, - gfp_t gfp_flags) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = *last_pg; - int ret = 0; - - while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && - (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev - first_dev) - max_comp = dev - first_dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len, gfp_flags); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - ios->length += cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - *last_pg = cur_pg; - return ret; -} - -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) -{ - u64 length = ios->count; - u64 offset = ios->offset; - struct _striping_info si; - unsigned last_pg = 0; - int ret = 0; - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - if (!ios->length) - return ret; - - return 0; -} - -static int _sync_done(struct objio_state *ios) -{ - struct completion *waiting = ios->private; - - complete(waiting); - return 0; -} - -static void _last_io(struct kref *kref) -{ - struct objio_state *ios = container_of(kref, struct objio_state, kref); - - ios->done(ios); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct objio_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static int _io_exec(struct objio_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - unsigned i; - objio_done_fn saved_done_fn = ios->done; - bool sync = ios->sync; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - - if (!or) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - - if (sync) { - wait_for_completion(&wait); - ret = saved_done_fn(ios); - } - - return ret; -} -#endif - /* * read */ @@ -781,63 +418,6 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -#if 0 -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct osd_request *or = NULL; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - unsigned dev = per_dev->dev; - struct ore_comp *cred = - &ios->oc->comps[cur_comp]; - struct osd_obj_id obj = cred->obj; - int ret; - - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - 
} - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - -err: - return ret; -} - -static int _read_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _read_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _read_done; - return _io_exec(ios); - -err: - _io_free(ios); - return ret; -} -#endif - int objio_read_pagelist(struct nfs_read_data *rdata) { struct objio_state *objios; @@ -879,90 +459,6 @@ static void _write_done(struct ore_io_state *ios, void *private) objlayout_write_done(&objios->oir, status, objios->sync); } -#if 0 -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret; - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct osd_request *or = NULL; - struct ore_comp *cred = &ios->oc->comps[cur_comp]; - struct osd_obj_id obj = cred->obj; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - struct bio *bio; - - or = osd_start_request(_io_od(ios, dev), GFP_NOFS); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_NOFS, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - dprintk("Faild to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto err; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->bio = bio; - per_dev->dev = dev; - per_dev->length = master_dev->length; - per_dev->offset = master_dev->offset; - } else { - bio = master_dev->bio; - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - } - -err: - return ret; -} - -static int _write_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _write_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _write_done; - return _io_exec(ios); - -err: - _io_free(ios); - return ret; -} -#endif - int objio_write_pagelist(struct nfs_write_data *wdata, int how) { struct objio_state *objios; -- cgit v1.2.3-58-ga151 From 278c023a99b0d6b471d0f4a79835c703482e29ac Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:16:54 -0700 Subject: pnfs-obj: Support for RAID5 read-4-write interface. The ORE needs to be supplied a r4w_get_page/r4w_put_page API by the filesystem so it can get cache pages to read into when writing partial stripes.
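To make the contract concrete, here is a minimal sketch of how an ORE-side read-modify-write could consume these hooks; the helper name, the private-pointer plumbing and the read step are assumptions made for illustration, only the get_page/put_page hook pair comes from the patch below:

/* Illustrative sketch only - not the actual ore code */
static int ore_fill_missing_page(struct ore_io_state *ios, u64 offset)
{
	bool uptodate;
	struct page *page;

	/* ask the filesystem for the cache page covering @offset */
	page = ios->r4w->get_page(ios->private, offset, &uptodate);
	if (unlikely(!page))
		return -ENOMEM;

	if (!uptodate) {
		/* read the old stripe unit from the OSD into @page
		 * before folding it into the new parity */
	}

	ios->r4w->put_page(ios->private, page);
	return 0;
}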
Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 3161da654a9b..c807ab93140e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -459,6 +459,43 @@ static void _write_done(struct ore_io_state *ios, void *private) objlayout_write_done(&objios->oir, status, objios->sync); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; + struct page *page = find_get_page(wdata->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, page->index); + page_cache_release(page); + return; +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + int objio_write_pagelist(struct nfs_write_data *wdata, int how) { struct objio_state *objios; @@ -472,6 +509,7 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) return ret; objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; if (!objios->sync) objios->ios->done = _write_done; -- cgit v1.2.3-58-ga151 From b58dc4100b9190f2cb437f1f67ffcb9f9acc4923 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Mar 2011 15:53:40 -0700 Subject: ceph: clear parent D_COMPLETE flag when on dentry prune When the VFS prunes a dentry from the cache, clear the D_COMPLETE flag on the parent dentry. Do this for the live and snapshotted namespaces. Do not bother for the .snap dir contents, since we do not cache that. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 28 ++++++++++++++++++++++++++++ fs/ceph/super.h | 13 +++++++++++++ 2 files changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 382abc9a6a54..97fb155b3242 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1092,7 +1092,33 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + dout("d_release %p\n", dentry); + + /* do we have a valid parent? */ + if (!dentry->d_parent || IS_ROOT(dentry)) + return; + + /* if we are not hashed, we don't affect D_COMPLETE */ + if (d_unhashed(dentry)) + return; + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + di = ceph_dentry(dentry->d_parent); + clear_bit(CEPH_D_COMPLETE, &di->flags); +} /* * read() on a dir. 
This weird interface hack only works if mounted @@ -1306,6 +1332,7 @@ const struct inode_operations ceph_dir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; const struct dentry_operations ceph_snapdir_dentry_ops = { @@ -1315,4 +1342,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = { const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b01442aaf278..98e60693c9a1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -203,6 +203,7 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -213,6 +214,18 @@ struct ceph_dentry_info { u64 offset; }; +/* + * dentry flags + * + * The locking for D_COMPLETE is a bit odd: + * - we can clear it at almost any time (see ceph_d_prune) + * - it is only meaningful if: + * - we hold dir inode i_lock + * - we hold dir FILE_SHARED caps + * - the dentry D_COMPLETE is set + */ +#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing -- cgit v1.2.3-58-ga151 From 161ebf9fcca967e7396bb076af5ed18539497a3f Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 29 Oct 2011 17:17:58 +0400 Subject: CIFS: Simplify setlk error handling for mandatory locking Now we allocate the lock structure first, then send the request to the server, and save the lock through a void function if the server returned OK - this prevents the situation where we lock a file on the server and then return -ENOMEM from setlk.
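Stripped of the cifs specifics, the ordering rule this enforces looks like the following sketch (all function and type names here are made up for illustration):

static int setlk_pattern(void)
{
	struct some_lock *lock;
	int rc;

	lock = alloc_lock_struct();	/* the only step that can fail locally */
	if (!lock)
		return -ENOMEM;		/* nothing was sent to the server yet */

	rc = send_lock_to_server(lock);	/* the only step that changes server state */
	if (rc) {
		kfree(lock);		/* server holds no lock; local cleanup only */
		return rc;
	}

	record_lock(lock);		/* void: cannot fail once the server said OK */
	return 0;
}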
Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 64 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c1f063cd1b0c..d9cc07fec99f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) } static bool -cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, +__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, __u8 type, __u16 netfid, struct cifsLockInfo **conf_lock) { @@ -694,6 +694,14 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, return false; } +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + struct cifsLockInfo **conf_lock) +{ + return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, + lock->type, lock->netfid, conf_lock); +} + static int cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, __u8 type, __u16 netfid, struct file_lock *flock) @@ -704,8 +712,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, mutex_lock(&cinode->lock_mutex); - exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, - &conf_lock); + exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -723,40 +731,27 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, return rc; } -static int -cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, - __u8 type, __u16 netfid) +static void +cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) { - struct cifsLockInfo *li; - - li = cifs_lock_init(len, offset, type, netfid); - if (!li) - return -ENOMEM; - mutex_lock(&cinode->lock_mutex); - list_add_tail(&li->llist, &cinode->llist); + list_add_tail(&lock->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); - return 0; } static int -cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, - __u8 type, __u16 netfid, bool wait) +cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + bool wait) { - struct cifsLockInfo *lock, *conf_lock; + struct cifsLockInfo *conf_lock; bool exist; int rc = 0; - lock = cifs_lock_init(length, offset, type, netfid); - if (!lock) - return -ENOMEM; - try_again: exist = false; mutex_lock(&cinode->lock_mutex); - exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, - &conf_lock); + exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); @@ -781,7 +776,6 @@ try_again: } } - kfree(lock); mutex_unlock(&cinode->lock_mutex); return rc; } @@ -1254,20 +1248,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, } if (lock) { - rc = cifs_lock_add_if(cinode, flock->fl_start, length, - type, netfid, wait_flag); + struct cifsLockInfo *lock; + + lock = cifs_lock_init(length, flock->fl_start, type, netfid); + if (!lock) + return -ENOMEM; + + rc = cifs_lock_add_if(cinode, lock, wait_flag); if (rc < 0) - return rc; - else if (!rc) + kfree(lock); + if (rc <= 0) goto out; rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, flock->fl_start, 0, 1, type, wait_flag, 0); - if (rc == 0) { - /* For 
Windows locks we must store them. */ - rc = cifs_lock_add(cinode, length, flock->fl_start, - type, netfid); + if (rc) { + kfree(lock); + goto out; } + + cifs_lock_add(cinode, lock); } else if (unlock) rc = cifs_unlock_range(cfile, flock, xid); -- cgit v1.2.3-58-ga151 From a88b470773bc5b640292d8be7b8e7e1011a86639 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 29 Oct 2011 17:17:59 +0400 Subject: CIFS: Cleanup byte-range locking code style Reorder parms of cifs_lock_init, trivially simplify getlk code and remove extra {} in cifs_lock_add_if. Cc: Dan Carpenter Acked-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d9cc07fec99f..cf0b1539b321 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file) } static struct cifsLockInfo * -cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) { - struct cifsLockInfo *li = + struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (!li) - return li; - li->netfid = netfid; - li->offset = offset; - li->length = len; - li->type = type; - li->pid = current->tgid; - INIT_LIST_HEAD(&li->blist); - init_waitqueue_head(&li->block_q); - return li; + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->netfid = netfid; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; } static void @@ -770,10 +770,8 @@ try_again: (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - else { - mutex_lock(&cinode->lock_mutex); - list_del_init(&lock->blist); - } + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); } mutex_unlock(&cinode->lock_mutex); @@ -927,7 +925,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) else type = CIFS_WRLCK; - lck = cifs_lock_init(length, flock->fl_start, type, + lck = cifs_lock_init(flock->fl_start, length, type, cfile->netfid); if (!lck) { rc = -ENOMEM; @@ -1064,14 +1062,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, if (rc != 0) cERROR(1, "Error unlocking previously locked " "range %d during test of lock", rc); - rc = 0; - return rc; + return 0; } if (type & LOCKING_ANDX_SHARED_LOCK) { flock->fl_type = F_WRLCK; - rc = 0; - return rc; + return 0; } rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, @@ -1089,8 +1085,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, } else flock->fl_type = F_WRLCK; - rc = 0; - return rc; + return 0; } static void @@ -1250,7 +1245,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (lock) { struct cifsLockInfo *lock; - lock = cifs_lock_init(length, flock->fl_start, type, netfid); + lock = cifs_lock_init(flock->fl_start, length, type, netfid); if (!lock) return -ENOMEM; -- cgit v1.2.3-58-ga151 From 6070295efc90d1093b2031c43380bd7d9673c802 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Nov 2011 07:04:10 -0400 Subject: nfs: set vs_hidden on nfs4_callback_version4 (try #2) This service should not be registered with or unregistered from rpcbind. 
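For context, the sunrpc server consults this flag when registering program versions with rpcbind; the check looks roughly like this (paraphrased from net/sunrpc/svc.c of this era, surrounding details elided):

for (i = 0; i < progp->pg_nvers; i++) {
	if (progp->pg_vers[i] == NULL)
		continue;
	if (progp->pg_vers[i]->vs_hidden)
		continue;	/* hidden versions are never told to rpcbind */
	error = __svc_register(progp->pg_name, progp->pg_prog,
			       i, family, proto, port);
}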
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index ee1a5b3cd48d..726e59a9e50f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -987,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, }; -- cgit v1.2.3-58-ga151 From 5c8a0fbba543d9428a486f0d1282bbcf3cf1d95a Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 1 Nov 2011 18:23:10 -0500 Subject: VFS: fix statfs() automounter semantics regression No one in their right mind would expect statfs() to not work on a automounter managed mount point. Fix it. [ I'm not sure about the "no one in their right mind" part. It's not mounted, and you didn't ask for it to be mounted. But nobody will really care, and this probably makes it match previous semantics, so.. - Linus ] This mirrors the fix made to the quota code in 815d405ceff0d69646. Signed-off-by: Dan McGee Cc: Trond Myklebust Cc: Alexander Viro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/statfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/statfs.c b/fs/statfs.c index 8244924dec55..9cf04a118965 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; - int error = user_path(pathname, &path); + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); -- cgit v1.2.3-58-ga151 From c6ffe10015f4e6fba8a915318b319c43aed1836f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Nov 2011 09:23:36 -0700 Subject: ceph: use new D_COMPLETE dentry flag We used to use a flag on the directory inode to track whether the dcache contents for a directory were a complete cached copy. Switch to a dentry flag CEPH_D_COMPLETE that is safely updated by ->d_prune(). Signed-off-by: Sage Weil --- fs/ceph/caps.c | 8 +++---- fs/ceph/dir.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/ceph/inode.c | 8 +++---- fs/ceph/mds_client.c | 6 +++--- fs/ceph/super.h | 10 +++++++-- 5 files changed, 68 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 15b21e35078a..0f327c6c9679 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, clear D_COMPLETE; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - } + if (S_ISDIR(ci->vfs_inode.i_mode)) + ceph_dir_clear_complete(&ci->vfs_inode); } } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 97fb155b3242..2abd0dfad7f8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * I_COMPLETE tells indicates we have all dentries in the dir. 
It is + * D_COMPLETE tells indicates we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ @@ -199,8 +199,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_test_complete(dir)) { + dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -285,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(inode) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&inode->i_lock); err = __dcache_readdir(filp, dirent, filldir); @@ -351,7 +351,7 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude I_COMPLETE */ + fi->dir_release_count--; /* preclude D_COMPLETE */ } /* note next offset and last dentry name */ @@ -430,8 +430,7 @@ more: */ spin_lock(&inode->i_lock); if (ci->i_release_count == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = filp->f_pos; } spin_unlock(&inode->i_lock); @@ -614,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(dir) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, */ /* d_move screws up d_subdirs order */ - ceph_i_clear(new_dir, CEPH_I_COMPLETE); + ceph_dir_clear_complete(new_dir); d_move(old_dentry, new_dentry); @@ -1092,6 +1091,48 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * Set/clear/test dir complete flag on the dir's dentry. + */ +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + return alias; +} + +void ceph_dir_set_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } +} + +void ceph_dir_clear_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } +} + +bool ceph_dir_test_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) + return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + return false; +} + /* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. 
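Worth noting: the flag lives in ceph_dentry_info->flags and is manipulated with the atomic bitops, so the set/clear/test helpers above need no lock of their own; the underlying pattern is simply:

unsigned long flags = 0;

set_bit(CEPH_D_COMPLETE, &flags);	/* atomic read-modify-write */
if (test_bit(CEPH_D_COMPLETE, &flags))
	;	/* the dcache holds the complete directory */
clear_bit(CEPH_D_COMPLETE, &flags);	/* safe against a racing d_prune */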
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1616a0d37cbd..e392bfce84a3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -771,9 +771,9 @@ no_change: ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + !ceph_dir_test_complete(inode)) { dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = 2; } @@ -856,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) di = ceph_dentry(dn); spin_lock(&inode->i_lock); - if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + if (!ceph_dir_test_complete(inode)) { spin_unlock(&inode->i_lock); return; } @@ -1056,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate * directory offset so we can behave when holding - * I_COMPLETE. + * D_COMPLETE. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1d72f15fe9f4..b4c3efa163c2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2002,7 +2002,7 @@ out: } /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) @@ -2010,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) struct inode *inode = req->r_locked_dir; struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ceph_dir_clear_complete(inode); ci->i_release_count++; spin_unlock(&inode->i_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 98e60693c9a1..01bf189e08a9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -264,7 +264,7 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -429,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ #define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ @@ -486,6 +485,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) return ((loff_t)frag << 32) | (loff_t)off; } +/* + * set/clear directory D_COMPLETE flag + */ +void ceph_dir_set_complete(struct inode *inode); +void ceph_dir_clear_complete(struct inode *inode); +bool ceph_dir_test_complete(struct inode *inode); + /* * caps helpers */ -- cgit v1.2.3-58-ga151 From 7fd7d101ff50af55d6d69f4705facc00c324024e Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:22:11 -0700 Subject: ceph/mds_client.c: quiet sparse noise Quiet the following sparse noise: warning: symbol 'get_nonsnap_parent' was not declared. Should it be static? 
warning: symbol 'done_closing_sessions' was not declared. Should it be static? Local functions don't need external visibility. Make them static. Signed-off-by: H Hartley Sweeten Cc: Sage Weil Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b4c3efa163c2..264ab701154f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -struct dentry *get_nonsnap_parent(struct dentry *dentry) +static struct dentry *get_nonsnap_parent(struct dentry *dentry) { /* * we don't need to worry about protecting the d_parent access @@ -3154,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; -- cgit v1.2.3-58-ga151 From 0c6d4b4e22a513f8563a2e00c5ab08e9f8796272 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 11:53:30 -0700 Subject: ceph/super.c: quiet sparse noise Quiet the sparse noise: warning: symbol 'create_fs_client' was not declared. Should it be static? warning: symbol 'destroy_fs_client' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Cc: Sage Weil ceph-devel@vger.kernel.org Signed-off-by: Sage Weil --- fs/ceph/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 788f5ad8e66d..a90846fac759 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -426,7 +426,7 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) /* * create a new fs client */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; @@ -502,7 +502,7 @@ fail: return ERR_PTR(err); } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); -- cgit v1.2.3-58-ga151 From cd354ad613a393424f85333ceed6b15e07fb94ae Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 20 Oct 2011 15:45:37 -0400 Subject: Btrfs: don't wait as long for more batches during SSD log commit When we're doing log commits, we try to wait for more writers to come in and make the commit bigger. This helps improve performance on rotating disks, but on SSDs it adds latencies.
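The loop this change sits in makes the trade-off visible; paraphrased from btrfs_sync_log(), where only the btrfs_test_opt(root, SSD) check is new in this patch:

while (1) {
	unsigned long batch = root->log_batch;	/* sample before napping */
	/* rotating disks: nap so more writers can join this commit;
	 * SSDs: skip the nap, the extra latency isn't worth it */
	if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
		mutex_unlock(&root->log_mutex);
		schedule_timeout_uninterruptible(1);
		mutex_lock(&root->log_mutex);
	}
	wait_for_writer(trans, root);
	if (batch == root->log_batch)
		break;		/* nobody new arrived; commit now */
}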
Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 786639fca067..310ab22cfe58 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2013,10 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { unsigned long batch = root->log_batch; - if (root->log_multiple_pids) { + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); -- cgit v1.2.3-58-ga151 From 1eae31e918972bbeefc119d23c1d67674f49a301 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 14 Oct 2011 06:31:20 -0400 Subject: Btrfs: make sure btrfs_remove_free_space doesn't leak EAGAIN btrfs_remove_free_space needs to make sure to set ret back to a valid return value after setting it to EAGAIN, otherwise we return it to the callers. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index de205d59b74b..f49475dfa954 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1868,6 +1868,7 @@ again: ctl->total_bitmaps--; } kmem_cache_free(btrfs_free_space_cachep, info); + ret = 0; goto out_lock; } @@ -1875,7 +1876,8 @@ again: unlink_free_space(ctl, info); info->offset += bytes; info->bytes -= bytes; - link_free_space(ctl, info); + ret = link_free_space(ctl, info); + WARN_ON(ret); goto out_lock; } -- cgit v1.2.3-58-ga151 From e688b7252f784c2479d559f9f70ca8354752c5e7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 31 Oct 2011 20:52:39 -0400 Subject: Btrfs: fix extent pinning bugs in the tree log The tree log had two important bugs that could cause corruptions after a crash. Sometimes we were allowing tree log blocks to be reused after the tree log was committed but before the transaction commit was done. This allowed a future metadata write to overwrite the tree log data. It is fixed by adding a new variant of freeing reserved extents that always pins them. Credit goes to Stefan Behrens and Arne Jansen for many many hours spent tracking this bug down. During tree log replay, we do a pass through the tree log and pin all the extents we find. This makes sure the replay code won't go in and use any of those blocks for new allocations during replay. The problem is the free space cache isn't honoring these pinned extents. So the allocator can end up handing them out, leading to all kinds of problems during replay. The fix here is to force any free space cache to load while we pin the extents, and then to make sure we remove the pinned extents from the free space rbtree. 
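Condensed, the replay-time pin helper added below boils down to this sequence (taken from btrfs_pin_extent_for_log_replay() in the diff):

cache = btrfs_lookup_block_group(root->fs_info, bytenr);
cache_block_group(cache, trans, root, 1);	/* force the free space cache to load */
pin_down_extent(root, cache, bytenr, num_bytes, 0);
btrfs_remove_free_space(cache, bytenr, num_bytes); /* make the cache honor the pin */
btrfs_put_block_group(cache);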
Signed-off-by: Chris Mason Reported-by: Stefan Behrens --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/tree-log.c | 11 ++++++----- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 227620993bce..f63c9b3f6e08 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2156,6 +2156,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -2206,6 +2209,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 28c4809851a5..cb7626646bba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4344,6 +4344,34 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. 
+ */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -5487,7 +5515,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5502,8 +5531,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5511,6 +5544,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 310ab22cfe58..8ca1b6b83bd1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + btrfs_pin_extent_for_log_replay(wc->trans, + log->fs_info->extent_root, + eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); } @@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); BUG_ON(ret); @@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); BUG_ON(ret); } -- cgit v1.2.3-58-ga151 From 01d658f2ca3c85c1ffb20b306e30d16197000ce7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 1 Nov 2011 10:08:06 -0400 Subject: Btrfs: make sure to flush queued bios if write_cache_pages waits write_cache_pages tries to build up a large bio to stuff down the pipe. 
But if it needs to wait for a page lock, it needs to make sure and send down any pending writes so we don't deadlock with anyone who has the page lock and is waiting for writeback of things inside the bio. Dave Sterba triggered this as a deadlock between the autodefrag code and the extent write_cache_pages Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 13 ++++++++++--- fs/btrfs/disk-io.h | 2 -- fs/btrfs/extent_io.c | 14 ++++++++++---- fs/btrfs/extent_io.h | 3 ++- 4 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51372a521167..6f58911ece0d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2735,7 +2735,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return ret; } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2752,7 +2753,10 @@ int btree_lock_page_hook(struct page *page) if (!eb) goto out; - btrfs_tree_lock(eb); + if (!btrfs_try_tree_write_lock(eb)) { + flush_fn(data); + btrfs_tree_lock(eb); + } btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2767,7 +2771,10 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_unlock(eb); free_extent_buffer(eb); out: - lock_page(page); + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } return 0; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..e678539c8519 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -83,8 +83,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f284d4e5f447..b40ba75f4483 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2613,10 +2613,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 325a346369da..cbd4824a7c94 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -86,7 +86,8 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + int (*write_cache_pages_lock_hook)(struct page *page, void *data, + void (*flush_fn)(void *)); }; struct extent_io_tree { -- cgit v1.2.3-58-ga151 From 5a77d76c243be18d854aa1b14d697525f60e169a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Nov 2011 14:32:23 -0400 Subject: Btrfs: release metadata from global reserve if we have to fallback for unlink I fixed a problem where we weren't reserving space for an orphan item when we had to fallback to using the global reserve for an unlink, but I introduced another problem. 
I was migrating the bytes from the transaction reserve to the global reserve and then releasing from the global reserve in btrfs_end_transaction(). The problem with this is that a migrate will jack up the size for the destination, but leave the size alone for the source, with the idea that you can do a release normally on the source and it all washes out, and then you can do a release again on the destination and it works out right. My way was skipping the release on the trans_block_rsv which still had the jacked up size from our original reservation. So instead release manually from the global reserve if this transaction was using it, and then set the trans->block_rsv back to the trans_block_rsv so that btrfs_end_transaction cleans everything up properly. With this patch xfstest 83 doesn't emit warnings about leaking space. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 65474d95f26f..a8586e10953c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2904,7 +2904,7 @@ out: if (!err) err = btrfs_block_rsv_migrate(trans->block_rsv, &root->fs_info->global_block_rsv, - btrfs_calc_trans_metadata_size(root, 1)); + trans->bytes_reserved); if (err) { btrfs_end_transaction(trans, root); @@ -2920,6 +2920,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (trans->block_rsv == &root->fs_info->global_block_rsv) { + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->block_rsv = &root->fs_info->trans_block_rsv; BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } -- cgit v1.2.3-58-ga151 From c8174313a8102e874aaa321e2fc4c7c460a87151 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Nov 2011 09:29:35 -0400 Subject: Btrfs: use the global reserve when truncating the free space cache inode We no longer use the orphan block rsv for holding the reservation for truncating the inode, so instead use the global block rsv and check to make sure it has enough space for us to truncate the space. 
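The check itself is small; condensed from the diff below, after trans->block_rsv has been pointed at the global reserve and the old value saved in rsv:

/* 1 for slack space, 1 for updating the inode */
needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
	       btrfs_calc_trans_metadata_size(root, 1);

spin_lock(&trans->block_rsv->lock);
if (trans->block_rsv->reserved < needed_bytes) {
	spin_unlock(&trans->block_rsv->lock);
	trans->block_rsv = rsv;		/* restore the saved rsv */
	return -ENOSPC;			/* refuse rather than overdraw the reserve */
}
spin_unlock(&trans->block_rsv->lock);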
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f49475dfa954..7a15fcfb3e1f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,14 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct inode *inode) { struct btrfs_block_rsv *rsv; + u64 needed_bytes; loff_t oldsize; int ret = 0; rsv = trans->block_rsv; - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5); - if (ret) - return ret; + trans->block_rsv = &root->fs_info->global_block_rsv; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&trans->block_rsv->lock); + if (trans->block_rsv->reserved < needed_bytes) { + spin_unlock(&trans->block_rsv->lock); + trans->block_rsv = rsv; + return -ENOSPC; + } + spin_unlock(&trans->block_rsv->lock); oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -218,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - trans->block_rsv = rsv; if (ret) { + trans->block_rsv = rsv; WARN_ON(1); return ret; } ret = btrfs_update_inode(trans, root, inode); + trans->block_rsv = rsv; + return ret; } -- cgit v1.2.3-58-ga151 From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Apr 2011 15:41:04 +0200 Subject: btrfs: separate superblock items out of fs_info fs_info is now ~9kb, more than fits into one page. This will cause mount failure when memory is too fragmented. Top space consumers are super block structures super_copy and super_for_commit, ~2.8kb each. Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64) Add a wrapper for freeing fs_info and all of its dynamically allocated members.
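The arithmetic behind the mount failure: with 4kb pages, a ~9kb kzalloc() is rounded up to a 16kb slab, i.e. an order-2 physically contiguous block, while after the split every piece fits in a single order-0 page. The allocation then pairs with the new wrapper like this (names from the diff below; the error path is simplified as a sketch - kfree(NULL) is a no-op, so freeing a partially initialized fs_info is safe):

fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
	free_fs_info(fs_info);
	return ERR_PTR(-ENOMEM);
}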
Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 ++- fs/btrfs/ctree.h | 16 ++++++++++++++-- fs/btrfs/disk-io.c | 33 ++++++++++----------------------- fs/btrfs/extent-tree.c | 16 ++++++++-------- fs/btrfs/file-item.c | 17 ++++++----------- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 8 ++++---- fs/btrfs/scrub.c | 2 +- fs/btrfs/super.c | 19 +++++++++++++------ fs/btrfs/transaction.c | 10 +++++----- fs/btrfs/tree-log.c | 4 ++-- fs/btrfs/volumes.c | 24 ++++++++++++------------ 12 files changed, 78 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86f1734..14f1c5a0b2d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f63c9b3f6e08..5181c53c1124 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -936,8 +936,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -2387,6 +2387,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6f58911ece0d..761717c98278 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -1766,14 +1765,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2152,7 +2151,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: 
invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2164,12 +2162,7 @@ fail_bdi: fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + free_fs_info(fs_info); return ERR_PTR(err); } @@ -2338,10 +2331,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); @@ -2603,7 +2596,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2624,12 +2616,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb7626646bba..782eb3ea8edf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if 
(btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; @@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821becd..c7fb3a4247d3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8586e10953c..b6b612e14ed7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -824,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33aae13cc74b..8f6e14279409 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,7 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -1164,7 +1164,7 @@ int btrfs_defrag_file(struct inode *inode, struct file 
*file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; @@ -2613,7 +2613,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2629,7 +2629,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..69a600f07763 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->curr = -1; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5429b1fa0bfc..f7e9de724ef2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -216,7 +216,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *compress_type; bool compress_force = false; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); @@ -524,7 +524,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. 
*/ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -937,6 +937,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; tree_root->fs_info = fs_info; + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_close_devices; + } + bdev = fs_devices->latest_bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) { @@ -951,7 +958,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -979,7 +986,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); return ERR_PTR(error); } @@ -1005,7 +1012,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -1171,7 +1178,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 29bef63e23ba..373c7ec1a026 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -991,7 +991,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1301,12 +1301,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ca1b6b83bd1..f4d81c06d48f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2118,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, 
+ btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6938b45e0fd..c3b45564048e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1395,8 +1395,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1458,7 +1458,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1706,12 +1706,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1802,7 +1802,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1861,7 +1861,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2187,7 +2187,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2311,7 +2311,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy 
= &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -3653,7 +3653,7 @@ static int read_one_dev(struct btrfs_root *root, int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; -- cgit v1.2.3-58-ga151 From af31f5e5b84b5bf2bcec464153a5130b170b2770 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 15:17:42 -0400 Subject: Btrfs: add a log of past tree roots This takes some of the free space in the btrfs super block to record information about most of the roots in the last four commits. It also adds a -o recovery to use the root history log when we're not able to read the tree of tree roots, the extent tree root, the device tree root or the csum root. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 95 +++++++++++++++++ fs/btrfs/disk-io.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/super.c | 7 +- 3 files changed, 369 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5181c53c1124..78f43d1102a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -360,6 +360,47 @@ struct btrfs_header { #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 #define BTRFS_LABEL_SIZE 256 +/* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unsed_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -406,6 +447,7 @@ struct btrfs_super_block { /* future expansion */ __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -1113,6 +1155,9 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* next backup root to be overwritten */ + int backup_root_index; }; /* @@ -1357,6 +1402,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1972,6 +2018,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); 
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 761717c98278..a61f8a6cf219 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1134,10 +1134,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { free_extent_buffer(root->node); + root->node = NULL; return -EIO; } root->commit_root = btrfs_root_node(root); @@ -1576,6 +1578,228 @@ sleep: return 0; } +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. 
+ */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + 
btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_extent_buffer(info->tree_root->node); + free_extent_buffer(info->tree_root->commit_root); + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + + info->tree_root->node = NULL; + info->tree_root->commit_root = NULL; + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + + if (chunk_root) { + free_extent_buffer(info->chunk_root->node); + free_extent_buffer(info->chunk_root->commit_root); + info->chunk_root->node = NULL; + info->chunk_root->commit_root = NULL; + } +} + + struct btrfs_root *open_ctree(struct super_block *sb, 
struct btrfs_fs_devices *fs_devices, char *options) @@ -1603,6 +1827,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, int ret; int err = -EINVAL; + int num_backups_tried = 0; + int backup_index = 0; struct btrfs_super_block *disk_super; @@ -1781,6 +2007,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. @@ -1938,7 +2171,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1953,11 +2186,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_close_extra_devices(fs_devices); +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1965,32 +2199,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) - goto fail_tree_root; + goto recovery_tree_root; extent_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) - goto fail_extent_root; + goto recovery_tree_root; dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_dev_root; + goto recovery_tree_root; csum_root->track_dirty = 1; @@ -2123,20 +2358,10 @@ fail_cleaner: fail_block_groups: btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2164,6 +2389,25 @@ fail_srcu: fail: free_fs_info(fs_info); return ERR_PTR(err); + +recovery_tree_root: + + if (!btrfs_test_opt(tree_root, RECOVERY)) 
+ goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2333,6 +2577,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f7e9de724ef2..57080dffdfc6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -198,6 +198,7 @@ static match_table_t tokens = { {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "no_space_cache"}, + {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -392,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_recovery: + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); -- cgit v1.2.3-58-ga151 From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion I've been hitting warnings in use_block_rsv when running the delayed insertion stuff. It's because we will readjust global block rsv based on what is in use, which means we could end up discarding reservations that are for the delayed insertion stuff. So instead create a separate block rsv for the delayed insertion stuff. This will also make it easier to debug problems with the delayed insertion reservations since we will know that only the delayed insertion code touches this block_rsv.
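For orientation, the mechanics of adding a dedicated reservation are spread across four files in the diff below; this condensed sketch gathers the touch points in one place (kernel C assembled from this patch's own hunks, not a new API, and not independently buildable):

/* 1) declare it beside the other reservations (fs/btrfs/ctree.h) */
struct btrfs_block_rsv delayed_block_rsv;

/* 2) zero it at mount time (open_ctree in fs/btrfs/disk-io.c) */
btrfs_init_block_rsv(&fs_info->delayed_block_rsv);

/* 3) wire it to the metadata space_info and warn on leaked bytes (fs/btrfs/extent-tree.c) */
fs_info->delayed_block_rsv.space_info = space_info;
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);

/* 4) make the delayed-item reserve/release paths use it instead of the global rsv (fs/btrfs/delayed-inode.c) */
dst_rsv = &root->fs_info->delayed_block_rsv;
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);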
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/delayed-inode.c | 14 +++++++------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 78f43d1102a0..3002e5d4da0b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -957,6 +957,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672f4c18..fc4026af7290 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -628,7 +628,7 @@ static int btrfs_delayed_inode_reserve_metadata( return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -646,7 +646,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1026,7 +1026,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1069,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1149,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a61f8a6cf219..23b6776477b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1891,6 +1891,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); 
atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 782eb3ea8edf..01c1f08b976a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, -- cgit v1.2.3-58-ga151 From 663350ac38c67ca388acea6e876dc6d668c232b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: be smarter about committing the transaction in reserve_metadata_bytes Because of the overcommit stuff I had to make it so that we committed the transaction all the time in reserve_metadata_bytes in case we had overcommitted because of delayed items. This was because previously we had no way of knowing how much space was reserved for delayed items. Now that we have the delayed_block_rsv we can check it to see if committing the transaction would get us anywhere. This patch breaks out the committing logic into a helper function that will check to see if committing the transaction would free enough space for us to get anything done. With this patch xfstests 83 goes from taking 445 seconds to taking 28 seconds on my box. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 86 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 01c1f08b976a..5b84205e7685 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,12 +3334,12 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3348,6 +3348,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; @@ -3417,6 +3418,60 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. 
+ */ +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -3436,7 +3491,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - struct btrfs_trans_handle *trans; u64 used; u64 num_bytes = orig_bytes; int retries = 0; @@ -3445,7 +3499,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, bool flushing = false; bool wait_ordered = false; - trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3461,7 +3514,7 @@ again: * deadlock since we are waiting for the flusher to finish, but * hold the current transaction open. */ - if (trans) + if (current->journal_info) return -EAGAIN; ret = wait_event_interruptible(space_info->wait, !space_info->flush); @@ -3517,12 +3570,16 @@ again: */ avail = (space_info->total_bytes - space_info->bytes_used) * 8; do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !trans && - !committed) { + if (space_info->bytes_pinned >= avail && flush && !committed) { space_info->flush = 1; flushing = true; spin_unlock(&space_info->lock); - goto commit; + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; } spin_lock(&root->fs_info->free_chunk_lock); @@ -3575,7 +3632,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); + ret = shrink_delalloc(root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3592,21 +3649,12 @@ again: goto again; } - ret = -EAGAIN; - if (trans) - goto out; - -commit: ret = -ENOSPC; if (committed) goto out; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto out; - ret = btrfs_commit_transaction(trans, root); + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } -- cgit v1.2.3-58-ga151 From bf0da8c183a15656eee63c54f334c3794320872a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 4 Nov 2011 12:29:37 -0400 Subject: Btrfs: ClearPageError during writepage and clean_tree_block Failure testing was tripping up over stale PageError bits in metadata pages. If we have an io error on a block, and later on end up reusing it, nobody ever clears PageError on those pages. 
During commit, we'll find PageError and think we had trouble writing the block, which will lead to aborts and other problems. This changes clean_tree_block and the btrfs writepage code to clear the PageError bit. In both cases we're either completely done with the page or the page has good stuff and the error bit is no longer valid. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/transaction.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b40ba75f4483..cc3c58970d4e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2334,6 +2334,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); + + ClearPageError(page); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -3402,6 +3405,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 373c7ec1a026..29f782cc2cc9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -634,7 +634,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ret = btrfs_write_marked_extents(root, dirty_pages, mark); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - return ret || ret2; + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; } int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, -- cgit v1.2.3-58-ga151 From c06a0e120a4e381a1c291c1fce3c6155c5791cae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Nov 2011 19:56:02 -0400 Subject: Btrfs: fix delayed insertion reservation We all keep getting those stupid warnings from use_block_rsv when running stress.sh, and it's because the delayed insertion stuff is being stupid. It's not the delayed insertion stuffs fault, it's all just stupid. When marking an inode dirty for oh say updating the time on it, we just do a btrfs_join_transaction, which doesn't reserve any space. This is stupid because we're going to have to have space reserve to make this change, but we do it because it's fast because chances are we're going to call it over and over again and it doesn't matter. Well thanks to the delayed insertion stuff this is mostly the case, so we do actually need to make this reservation. So if trans->bytes_reserved is 0 then try to do a normal reservation. If not return ENOSPC which will make the btrfs_dirty_inode start a proper transaction which will let it do the whole ENOSPC dance and reserve enough space for the delayed insertion to steal the reservation from the transaction. The other stupid thing we do is not reserve space for the inode when writing to the thing. Usually this is ok since we have to update the time so we'd have already done all this work before we get to the endio stuff, so it doesn't matter. But this is stupid because we could write the data after the transaction commits where we changed the mtime of the inode so we have to cow all the way down to the inode anyway. This used to be masked by the delalloc reservation stuff, but because we delay the update it doesn't get masked in this case. So again the delayed insertion stuff bites us in the ass. 
So if our trans->block_rsv is delalloc, just steal the reservation from the delalloc reserve. Hopefully this won't bite us in the ass, but I've said that before. With this patch stress.sh no longer spits out those stupid warnings (famous last words). Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/delayed-inode.c | 36 ++++++++++++++++++++++++++++-------- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 3 files changed, 49 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3002e5d4da0b..6bb34fc1ff22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2353,6 +2353,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc4026af7290..bbe8496d5339 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata( u64 num_bytes; int ret; - if (!trans->bytes_reserved) - return 0; - src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv) { + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. 
+ */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) + node->bytes_reserved = num_bytes; + return ret; + } + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); if (!ret) node->bytes_reserved = num_bytes; @@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, } ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); - /* - * we must reserve enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + if (ret) + goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b84205e7685..23e936c3de76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3814,6 +3814,24 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { -- cgit v1.2.3-58-ga151 From 740c3d226cbba6cd6a32adfb66809c94938f3e57 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Nov 2011 15:48:34 -0400 Subject: Btrfs: fix the new inspection ioctls for 32 bit compat The new ioctls to follow backrefs are not clean for 32/64 bit compat. This reworks them for u64s everywhere. They are brand new, so there are no problems with changing the interface now. Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 8 ++++---- fs/btrfs/ioctl.c | 10 +++++----- fs/btrfs/ioctl.h | 11 +++++------ fs/btrfs/scrub.c | 2 +- 4 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2351df0de450..8855aad3929c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -676,14 +676,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, bytes_left = ipath->fspath->bytes_left > s_ptr ? ipath->fspath->bytes_left - s_ptr : 0; - fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr; + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->str[i] = fspath; + ipath->fspath->val[i] = (u64)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { @@ -698,9 +698,9 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, /* * this dumps all file system paths to the inode into the ipath struct, provided * is has been created large enough. each path is zero-terminated and accessed - * from ipath->fspath->str[i]. + * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available - * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc9893990341..4a34c472f126 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2895,7 +2895,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; int i; - unsigned long rel_ptr; + u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; struct inode_fs_paths *ipath = NULL; @@ -2930,11 +2930,11 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str; - ipath->fspath->str[i] = (void *)rel_ptr; + rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user(ipa->fspath, ipath->fspath, size); + ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3017,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user(loi->inodes, inodes, size); + ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 2da30d4950e6..252ae9915de8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -198,24 +198,23 @@ struct btrfs_data_container { __u32 bytes_missing; /* out -- additional bytes needed for result */ __u32 elem_cnt; /* out */ __u32 elem_missed; /* out */ - union { - char *str[0]; /* out */ - __u64 val[0]; /* out */ - }; + __u64 val[0]; /* out */ }; struct btrfs_ioctl_ino_path_args { __u64 inum; /* in */ __u32 size; /* in */ __u64 reserved[4]; - struct btrfs_data_container *fspath; /* out */ + /* struct btrfs_data_container *fspath; out */ + __u64 fspath; /* out */ }; struct btrfs_ioctl_logical_ino_args { __u64 logical; /* in */ __u32 size; /* in */ __u64 reserved[4]; - struct btrfs_data_container *inodes; /* out */ + /* struct btrfs_data_container *inodes; out */ + __u64 inodes; }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 94cd3a19e9c8..562dad10dee9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - ipath->fspath->str[i]); + (char *)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v1.2.3-58-ga151 From c674e04e1cd6049715e7b9446790f4b441e547c0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 22:23:13 -0400 Subject: Btrfs: fix extent_buffer leak in the metadata IO error handling The scrub readahead branch brought in a new error handling hook, but it was leaking extent_buffer references. 
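The leak follows the usual reference-counting rule: the lookup that produced the extent_buffer took a reference, so every exit from the hook must drop it, the error path included. A minimal sketch of the fixed shape (eb_lookup() is a hypothetical stand-in for the btree lookup; free_extent_buffer() is the real drop added by the one-line fix below):

	eb = eb_lookup(tree, start, len);	/* hypothetical: returns eb with a reference held */
	if (!eb)
		goto out;
	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
		btree_readahead_hook(root, eb, eb->start, -EIO);
	}
	free_extent_buffer(eb);			/* drop the lookup reference on this path too */
out:
	return -EIO;				/* we fixed nothing */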
Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0eb1f0951251..40a62b980087 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -644,6 +644,7 @@ static int btree_io_failed_hook(struct bio *failed_bio, clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, -EIO); } + free_extent_buffer(eb); out: return -EIO; /* we fixed nothing */ -- cgit v1.2.3-58-ga151 From 306c8b68c82dfe6b7c9e5b61985760ad5d089205 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 15:21:39 -0400 Subject: Btrfs: stop the readahead threads on failed mount If we don't stop them, they linger around corrupting memory by using pointers to freed things. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 40a62b980087..e532892431f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2444,6 +2444,7 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); -- cgit v1.2.3-58-ga151 From 9510dc4c62252e96dca44a94948327c76774f25b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: stop leaking btrfs_bios on readahead Signed-off-by: Chris Mason --- fs/btrfs/reada.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index cd857119ba8a..35eaf6893679 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -423,6 +423,7 @@ again: } spin_unlock(&fs_info->reada_lock); + kfree(multi); return re; error: @@ -447,6 +448,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } + kfree(multi); kfree(re); if (looped) goto again; -- cgit v1.2.3-58-ga151 From 21ca543efc12674fddb22ddf4ea4906427f4e982 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: rename btrfs_bio multi -> bbio for consistency Signed-off-by: Chris Mason --- fs/btrfs/reada.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 35eaf6893679..2373b39a132b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -247,7 +247,7 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, struct btrfs_device *dev, u64 logical, - struct btrfs_bio *multi) + struct btrfs_bio *bbio) { int ret; int looped = 0; @@ -297,11 +297,11 @@ again: kref_init(&zone->refcnt); zone->elems = 0; zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < multi->num_stripes; ++i) { + for (i = 0; i < bbio->num_stripes; ++i) { /* bounds have already been checked */ - zone->devs[i] = multi->stripes[i].dev; + zone->devs[i] = bbio->stripes[i].dev; } - zone->ndevs = multi->num_stripes; + zone->ndevs = bbio->num_stripes; spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, @@ -327,7 +327,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_extent *re = NULL; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct 
btrfs_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; u32 blocksize; u64 length; @@ -361,21 +361,21 @@ again: * map block */ length = blocksize; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0); - if (ret || !multi || length < blocksize) + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + if (ret || !bbio || length < blocksize) goto error; - if (multi->num_stripes > MAX_MIRRORS) { + if (bbio->num_stripes > MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " "supported", MAX_MIRRORS); goto error; } - for (nzones = 0; nzones < multi->num_stripes; ++nzones) { + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { struct reada_zone *zone; - dev = multi->stripes[nzones].dev; - zone = reada_find_zone(fs_info, dev, logical, multi); + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); if (!zone) break; @@ -407,11 +407,11 @@ again: goto error; } for (i = 0; i < nzones; ++i) { - dev = multi->stripes[i].dev; + dev = bbio->stripes[i].dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { - dev = multi->stripes[i].dev; + dev = bbio->stripes[i].dev; BUG_ON(dev == NULL); radix_tree_delete(&dev->reada_extents, index); } @@ -423,7 +423,7 @@ again: } spin_unlock(&fs_info->reada_lock); - kfree(multi); + kfree(bbio); return re; error: @@ -448,7 +448,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - kfree(multi); + kfree(bbio); kfree(re); if (looped) goto again; -- cgit v1.2.3-58-ga151 From 56d2a48f81a1bde827c625b90221fade72fe9c61 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: fix a potential btrfs_bio leak on scrub fixups In case we were able to map less than we wanted (length < PAGE_SIZE clause is true) btrfs_bio is still allocated and we have to free it. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 562dad10dee9..ed11d3866afd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -655,6 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); WARN_ON(1); + kfree(bbio); return; } -- cgit v1.2.3-58-ga151 From d43317dcd074818d4bd12ddd4184a29aff98907b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 6 Nov 2011 03:26:19 -0500 Subject: Btrfs: fix race during transaction joins While we're allocating ram for a new transaction, we drop our spinlock. When we get the lock back, we do check to see if a transaction started while we slept, but we don't check to make sure it isn't blocked because a commit has already started. 
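The race has a classic shape: every check made before dropping a lock has to be redone after reacquiring it, because the allocation in between can sleep. A self-contained userspace analogue (plain C with pthreads; join_txn(), running and no_join are invented stand-ins for the btrfs names, purely illustrative):

#include <pthread.h>
#include <stdlib.h>

struct txn { int writers; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn *running;	/* stands in for fs_info->running_transaction */
static int no_join;		/* stands in for fs_info->trans_no_join */

static struct txn *join_txn(void)
{
	struct txn *fresh;

	pthread_mutex_lock(&lock);
loop:
	if (no_join) {				/* a commit has blocked new joins */
		pthread_mutex_unlock(&lock);
		return NULL;
	}
	if (running) {				/* join the transaction that already exists */
		running->writers++;
		pthread_mutex_unlock(&lock);
		return running;
	}

	pthread_mutex_unlock(&lock);		/* the allocation may sleep */
	fresh = calloc(1, sizeof(*fresh));
	if (!fresh)
		return NULL;
	pthread_mutex_lock(&lock);

	if (running || no_join) {
		/* someone raced in while we slept: discard ours, redo ALL the checks */
		free(fresh);
		goto loop;
	}

	fresh->writers = 1;
	running = fresh;
	pthread_mutex_unlock(&lock);
	return fresh;
}

The goto back to the top, taken with the lock held, is the point of the fix: rejoining mid-way (as the old code did) skips the no_join recheck and can join a transaction that is already committing.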
Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 29f782cc2cc9..960835eaf4da 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) struct btrfs_transaction *cur_trans; spin_lock(&root->fs_info->trans_lock); +loop: if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); if (root->fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the trans_no_join checks above + */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); - return 0; + goto loop; } + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; init_waitqueue_head(&cur_trans->writer_wait); -- cgit v1.2.3-58-ga151 From 7c7e82a77fe3d89ae50824aa7c897454675eb4c4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 6 Nov 2011 18:50:56 -0500 Subject: Btrfs: check for a null fs root when writing to the backup root log During log replay, we can commit the transaction before the fs_root pointers are set up, so we have to make sure they are not null before trying to use them. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e532892431f4..e53a5bb85670 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1754,11 +1754,18 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_extent_root_level(root_backup, btrfs_header_level(info->extent_root->node)); - btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); - btrfs_set_backup_fs_root_gen(root_backup, + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, btrfs_header_generation(info->fs_root->node)); - btrfs_set_backup_fs_root_level(root_backup, + btrfs_set_backup_fs_root_level(root_backup, btrfs_header_level(info->fs_root->node)); + } btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); btrfs_set_backup_dev_root_gen(root_backup, -- cgit v1.2.3-58-ga151 From 50e696308c3fb18a4a0dae7b3a4d47469149c919 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Nov 2011 16:39:57 +0000 Subject: vfs: d_invalidate() should leave mountpoints alone Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/dcache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 274f13e2f094..a901c6901bce 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -546,9 +546,11 @@ int d_invalidate(struct dentry * dentry) * would make it unreachable from the root, * we might still populate it if it was a * working directory or similar).
+ * We also need to leave mountpoints alone, + * directory or not. */ - if (dentry->d_count > 1) { - if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + if (dentry->d_count > 1 && dentry->d_inode) { + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; } } -- cgit v1.2.3-58-ga151 From 45ea6095c8f0d6caad5658306416a5d254f1205e Mon Sep 17 00:00:00 2001 From: "slyich@gmail.com" Date: Mon, 7 Nov 2011 16:08:01 -0500 Subject: btrfs: fix double-free 'tree_root' in 'btrfs_mount()' On the error path 'tree_root' is freed in 'free_fs_info()'. No need to free it explicitly. Noticed by SLUB in debug mode: Complete reproducer under usermode linux (discovered on real machine): bdev=/dev/ubda btr_root=/btr /mkfs.btrfs $bdev mount $bdev $btr_root mkdir $btr_root/subvols/ cd $btr_root/subvols/ /btrfs su cr foo /btrfs su cr bar mount $bdev -osubvol=subvols/foo $btr_root/subvols/bar umount $btr_root/subvols/bar which gives device fsid 4d55aa28-45b1-474b-b4ec-da912322195e devid 1 transid 7 /dev/ubda ============================================================================= BUG kmalloc-2048: Object already free ----------------------------------------------------------------------------- INFO: Allocated in btrfs_mount+0x389/0x7f0 age=0 cpu=0 pid=277 INFO: Freed in btrfs_mount+0x51c/0x7f0 age=0 cpu=0 pid=277 INFO: Slab 0x0000000062886200 objects=15 used=9 fp=0x0000000070b4d2d0 flags=0x4081 INFO: Object 0x0000000070b4d2d0 @offset=21200 fp=0x0000000070b4a968 ... Call Trace: 70b31948: [<6008c522>] print_trailer+0xe2/0x130 70b31978: [<6008c5aa>] object_err+0x3a/0x50 70b319a8: [<6008e242>] free_debug_processing+0x142/0x2a0 70b319e0: [<600ebf6f>] btrfs_mount+0x55f/0x7f0 70b319f8: [<6008e5c1>] __slab_free+0x221/0x2d0 Signed-off-by: Sergei Trofimovich Cc: Arne Jansen Cc: Chris Mason Cc: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 57080dffdfc6..dcd5aef6b614 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -933,8 +933,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + if (!fs_info) { + error = -ENOMEM; + goto error_close_devices; + } tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!tree_root) { error = -ENOMEM; goto error_close_devices; } @@ -964,7 +968,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -992,7 +995,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); return ERR_PTR(error); } -- cgit v1.2.3-58-ga151 From a3fbbde70a0cec017f2431e8f8de208708c76acc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Nov 2011 21:21:26 +0000 Subject: VFS: we need to set LOOKUP_JUMPED on mountpoint crossing Mountpoint crossing is similar to following procfs symlinks - we do not get ->d_revalidate() called for the dentry we have arrived at, with unpleasant consequences for NFS4.
Simple way to reproduce the problem in mainline: cat >/tmp/a.c <<'EOF' #include #include #include main() { struct flock fl = {.l_type = F_RDLCK, .l_whence = SEEK_SET, .l_len = 1}; if (fcntl(0, F_SETLK, &fl)) perror("setlk"); } EOF cc /tmp/a.c -o /tmp/test then on nfs4: mount --bind file1 file2 /tmp/test < file1 # ok /tmp/test < file2 # spews "setlk: No locks available"... What happens is the missing call of ->d_revalidate() after mountpoint crossing, and that's where NFS4 would issue an OPEN request to the server. The fix is simple - treat mountpoint crossing the same way we deal with following procfs-style symlinks. I.e. set LOOKUP_JUMPED... Cc: stable@kernel.org Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/namei.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index ac6d214da827..5008f01787f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret; + return ret < 0 ? ret : need_mntput; } int follow_down_one(struct path *path) @@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, break; path->mnt = mounted; path->dentry = mounted->mnt_root; + nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the @@ -1213,6 +1214,8 @@ retry: path_put_conditional(path, nd); return err; } + if (err) + nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; } @@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ error = complete_walk(nd); if (error) return ERR_PTR(error); @@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error < 0) goto exit_dput; + if (error) + nd->flags |= LOOKUP_JUMPED; + error = -ENOENT; if (!path->dentry->d_inode) goto exit_dput; @@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + goto exit; error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; -- cgit v1.2.3-58-ga151 From b52a360b2aa1c59ba9970fb0f52bbb093fcc7a24 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 7 Nov 2011 16:10:24 +0000 Subject: xfs: Fix possible memory corruption in xfs_readlink Fixes a possible memory corruption when the link is larger than MAXPATHLEN and XFS_DEBUG is not enabled. This also removes the S_ISLNK assert, since the inode mode is checked previously in xfs_readlink_by_handle() and via VFS. Updated to address concerns raised by Ben Hutchings about the loose attention paid to 32- vs 64-bit values, and the lack of handling a potentially negative pathlen value: - Changed type of "pathlen" to be xfs_fsize_t, to match that of ip->i_d.di_size - Added checking for a negative pathlen to the too-long pathlen test, and generalized the message that gets reported in that case to reflect the change. As a result, if a negative pathlen were encountered, this function would return EFSCORRUPTED (and would fail an assertion for a debug build)--just as would a too-long pathlen.
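A general rule sits behind the XFS patch that follows: a size field read from disk is untrusted input, so it must be range-checked before it sizes a copy, and an out-of-range value is filesystem corruption to be reported at runtime, not merely asserted on debug builds. A minimal sketch under those assumptions (disk_inode and its fields are hypothetical, not the XFS types):

	static int read_symlink(const struct disk_inode *di, char *buf)
	{
		long long len = di->size;	/* signed 64-bit, straight off disk */

		if (len < 0 || len > MAXPATHLEN)
			return -EFSCORRUPTED;	/* never trust len for the copy */
		memcpy(buf, di->data, len);	/* buf must hold MAXPATHLEN + 1 */
		buf[len] = '\0';
		return 0;
	}

An ASSERT alone would vanish on production builds, which is precisely how the original corruption went unnoticed.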
Signed-off-by: Alex Elder Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4ecf2a549060..ce9268a2f56b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -112,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -122,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(S_ISLNK(ip->i_d.di_mode)); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0'; -- cgit v1.2.3-58-ga151 From 272e42b215c52d32e06bf035c1f6b70baa6716bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2011 09:54:24 +0000 Subject: xfs: constify xfs_item_ops The log item ops aren't necessarily the biggest exploit vector, but marking them const is easy enough. Also remove the unused xfs_item_ops_t typedef while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot_item.c | 6 +++--- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_trans.h | 6 +++--- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1a3513881bce..eac97ef81e2a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d236d2..0dee0b71029d 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items.
*/ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e62623437..35c2aff38b20 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b7cf21ba240f..abaafdbb3e65 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -795,7 +795,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2758a6277c52..a14cd89fe465 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -626,7 +626,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039994af..3f7bf451c034 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 603f3eb52041..3ae713c0abd9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -326,7 +326,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +341,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -352,7 +352,7 @@ typedef struct xfs_item_ops { void (*iop_push)(xfs_log_item_t *); bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -- cgit v1.2.3-58-ga151 From 810627d9a6d0e8820c798001875bc4e1b7754ebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2011 08:56:15 +0000 Subject: xfs: fix force shutdown handling in xfs_end_io Ensure ioend->io_error gets propagated back to e.g. 
AIO completions. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 33b13310ee0c..574d4ee9b625 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_end_io( int error = 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = -EIO; + ioend->io_error = -EIO; goto done; } if (ioend->io_error) -- cgit v1.2.3-58-ga151 From 917c16b2b69fc2eeb432eabca73258f08c58361e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 8 Nov 2011 14:49:59 -0500 Subject: Btrfs: fix oops on NULL trans handle in btrfs_truncate If we fail to reserve space in the transaction during truncate, we can error out with a NULL trans handle. The cleanup code needs an extra check to make sure we aren't trying to use the bad handle. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9d0eaa57d4ee..f60e2490bd0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6529,14 +6529,16 @@ end_trans: ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); -- cgit v1.2.3-58-ga151 From 7fd2ae21a42d178982679b86086661292b4afe4a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 8 Nov 2011 15:47:34 -0500 Subject: Btrfs: fix our reservations for updating an inode when completing io People have been reporting ENOSPC crashes in finish_ordered_io. This is because we try to steal from the delalloc block rsv to satisfy a reservation to update the inode. The problem with this is we don't explicitly save space for updating the inode when doing delalloc. This is kind of a problem and we've gotten away with this because way back when we just stole from the delalloc reserve without any questions, and this worked out fine because generally speaking the leaf had been modified either by the mtime update when we did the original write or because we just updated the leaf when we inserted the file extent item, only on rare occasions had the leaf not actually been modified, and that was still ok because we'd just use a block or two out of the over-reservation that is delalloc. Then came the delayed inode stuff. This is amazing, except it wants a full reservation for updating the inode since it may do it at some point down the road after we've written the blocks and we have to recow everything again. This worked out because the delayed inode stuff just stole from the global reserve, that is until recently when I changed that because it caused other problems. So here we are, we're doing everything right and being screwed for it. So take an extra reservation for the inode at delalloc reservation time and carry it through the life of the delalloc reservation. If we need it we can steal it in the delayed inode stuff. If we have already stolen it try and do a normal metadata reservation. 
If that fails try to steal from the delalloc reservation. If _that_ fails we'll get a WARN_ON() so I can start thinking of a better way to solve this and in the meantime we'll steal from the global reserve. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't see any problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 4 +-- fs/btrfs/delayed-inode.c | 65 +++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-tree.c | 22 +++++++++++++--- fs/btrfs/inode.c | 1 + 4 files changed, 83 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325a3935..634608d2a6d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bbe8496d5339..313ee14cf3b7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata( if (!ret) node->bytes_reserved = num_bytes; return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; } +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - node->bytes_reserved = num_bytes; + if (unlikely(ret)) { + /* This shouldn't happen */ + BUG_ON(release); + return ret; + } + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. 
This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + node->bytes_reserved = num_bytes; return ret; } @@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 18ea90c8943b..0b044e509e9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - return 0; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; - return dropped_extents; + return dropped_extents + drop_inode_space; } /** @@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; BTRFS_I(inode)->reserved_extents += nr_extents; + } - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + BTRFS_I(inode)->delalloc_meta_reserved = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f60e2490bd0d..2b920596c126 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; -- cgit v1.2.3-58-ga151 From a90e8b6fb80db43b029e1e76205452afa8bdc77a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2011 16:47:55 +0200 Subject: Btrfs: fix memory leak in btrfs_parse_early_options() Don't leak subvol_name string in case multiple subvol= options are given. 
"The lastest option is effective" behavior (consistent with subvolid= and subvolrootid= options) is preserved. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dcd5aef6b614..6befcaf253bd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: -- cgit v1.2.3-58-ga151 From f23c8af8ca2789eeb0ab9ea90c214f9694d96cc5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2011 19:15:05 +0200 Subject: Btrfs: fix subvol_name leak on error in btrfs_mount() btrfs_parse_early_options() can fail due to error while scanning devices (-o device= option), but still strdup() subvol_name string: mount -o subvol=SUBV,device=BAD_DEVICE So free subvol_name string on error. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6befcaf253bd..58e9492230ce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -905,8 +905,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } if (subvol_name) { root = mount_subvol(subvol_name, flags, device_name, data); -- cgit v1.2.3-58-ga151 From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 00:08:15 +0200 Subject: Btrfs: avoid null dereference and leaks when bailing from open_ctree() Fix bugs introduced by 6c41761f. Firstly, after failing to allocate any of the tree roots (first 'goto fail' in open_ctree()) we would dereference a NULL fs_info pointer in free_fs_info(). Secondly, after failures from init_srcu_struct(), setup_bdi() and new_inode() we would leak all earlier allocated roots: fs_info fields haven't been initialized yet so free_fs_info() is rendered useless. Fix this by initializing fs_info pointer and fs_info fields before any allocations happen. 
Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e53a5bb85670..91db90b526c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - struct btrfs_super_block *disk_super; + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); -- cgit v1.2.3-58-ga151 From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 13:26:37 +0200 Subject: Btrfs: close devices on all error paths in open_ctree() Fix a bug introduced by 7e662854 where we would leave devices busy on certain error paths in open_ctree(). fs_info is guaranteed to be non-NULL now so it's safe to dereference it on all error paths. 
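The fix below moves btrfs_close_devices() to the bottom of open_ctree()'s unwind ladder, which works because the function follows the classic kernel idiom: one label per setup step, run in reverse order, so each path releases exactly what was acquired. A generic sketch (step/undo names are placeholders):

	static int setup(void)
	{
		int err;

		err = step_a();
		if (err)
			goto fail;
		err = step_b();
		if (err)
			goto fail_a;
		err = step_c();
		if (err)
			goto fail_b;
		return 0;
	fail_b:
		undo_b();
	fail_a:
		undo_a();
	fail:
		return err;
	}

Placing a release at the final label is what guarantees it runs no matter which step failed.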
Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91db90b526c2..b6a5c0dd0dd8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2460,21 +2460,20 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); return ERR_PTR(err); recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) goto fail_tree_roots; -- cgit v1.2.3-58-ga151 From 04d21a244fdf79d0ac892eaaa9a46b682467277c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 14:41:22 +0200 Subject: Btrfs: rework error handling in btrfs_mount() Commits 6c41761f and 45ea6095 introduced the possibility of a NULL pointer dereference on error paths; we would also leave all devices busy and leak fs_info with all sub-structures on error when trying to mount an already mounted fs to a different directory. Fix this by doing all allocations before trying to open any of the devices, and adjust the error path for the mount-already-mounted-fs case. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 58e9492230ce..629281c65ff5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -891,7 +891,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -920,15 +919,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) return ERR_PTR(error); - error = btrfs_open_devices(fs_devices, mode, fs_type); - if (error) - return ERR_PTR(error); - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need
*/ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) { - error = -ENOMEM; - goto error_close_devices; - } - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -966,7 +964,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + error = -EBUSY; + goto error_close_devices; } btrfs_close_devices(fs_devices); @@ -997,6 +996,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); +error_fs_info: free_fs_info(fs_info); return ERR_PTR(error); } -- cgit v1.2.3-58-ga151 From 5e442a493fc59fa536c76db1fff5b49ca36a88c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2011 18:16:00 -0500 Subject: Revert "proc: fix races against execve() of /proc/PID/fd**" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5. It escalates some of the google-chrome SELinux problems with ptrace ("Check failed: pid_ > 0. Did not find zygote process"), and Andrew says that it is also causing mystery lockdep reports.
Reported-by: Alex Villacís Lasso Requested-by: James Morris Requested-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 146 +++++++++++++++++---------------------------------------- 1 file changed, 43 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 2db1bd3173b2..851ba3dcdc29 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,46 +1652,12 @@ out: return error; } -static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int rc; - - if (task == NULL) - return -ESRCH; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - generic_fillattr(inode, stat); - unlock_trace(task); - rc = 0; -out_task: - put_task_struct(task); - return rc; -} - static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; -static const struct inode_operations proc_fdinfo_link_inode_operations = { - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - -static const struct inode_operations proc_fd_link_inode_operations = { - .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - /* building an inode */ @@ -1923,61 +1889,49 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task; - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); - int rc; - - task = get_proc_task(inode); - if (!task) - return -ENOENT; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - rc = -ENOENT; - files = get_files_struct(task); - if (files == NULL) - goto out_unlock; - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - rc = 0; - } else - rc = -ENOENT; - spin_unlock(&files->file_lock); - put_files_struct(files); - -out_unlock: - unlock_trace(task); -out_task: - put_task_struct(task); - return rc; + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_fd_link_inode_operations; + inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; - result = ERR_PTR(-EACCES); - if (lock_trace(task)) - goto out; - result = instantiate(dir, dentry, task, &fd); - unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; - - retval = -EACCES; - if (lock_trace(p)) - goto out; - retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out_unlock; + goto out; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } - -out_unlock: - unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; - inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ -- cgit v1.2.3-58-ga151 From 2115133f8b9a8dbdb217d14080814df07ce90479 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 10 Nov 2011 20:39:08 -0500 Subject: Btrfs: tweak the delayed inode reservations again Josef sent along an incremental to the inode reservation code to make sure we try and fall back to directly updating the inode item if things go horribly wrong. This reworks that patch slightly, adding a fallback function that will always try to update the inode item directly without going through the delayed_inode code. 
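Reduced to its skeleton, the fallback added below is an error-code-driven retry: take the fast path, and only for the one failure that has a safe alternative, fall back to the slow path. A sketch with placeholder names (not the btrfs functions):

	static int update_with_fallback(struct item *it)
	{
		int ret = update_delayed(it);	/* fast path, needs reserved space */

		if (ret == -ENOSPC)		/* reservation failed: go direct */
			ret = update_direct(it);
		return ret;
	}

Only -ENOSPC triggers the retry; any other error still propagates, so genuine failures are not masked by the fallback.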
Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 9 +++---- fs/btrfs/inode.c | 64 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 313ee14cf3b7..6a1a6800776c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -692,11 +692,6 @@ static int btrfs_delayed_inode_reserve_metadata( migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (unlikely(ret)) { - /* This shouldn't happen */ - BUG_ON(release); - return ret; - } out: /* @@ -712,9 +707,11 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. work. */ + if (!ret) + node->bytes_reserved = num_bytes; + if (release) btrfs_block_rsv_release(root, src_rsv, num_bytes); - node->bytes_reserved = num_bytes; return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b920596c126..7d394335bcb5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; @@ -2426,7 +2428,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2434,21 +2436,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2476,6 +2463,43 @@ failed: return ret; } +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. 
+ * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and @@ -5632,7 +5656,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5670,7 +5694,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode(trans, root, inode); + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, -- cgit v1.2.3-58-ga151 From 924cd8fbe41851eda2b68bf2ed501b2777fd77b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:04 -0500 Subject: Btrfs: fix nocow when deleting the item btrfs_previous_item() just searches the b+ tree; it does not COW the nodes or leaves, so if we modify its result, the metadata will be broken. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f8e2943101a1..c37433d3cd82 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], -- cgit v1.2.3-58-ga151 From ba38eb4de354d228f2792f93cde2c748a3a3f3b2 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:04 -0500 Subject: Btrfs: fix no reserved space for writing out inode cache The inode cache code forgets to reserve space when writing the cache out. And when we do some stress testing, such as synctest, it will trigger the WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [] warn_slowpath_common+0x80/0x98 [] warn_slowpath_null+0x15/0x17 [] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [] ?
__set_page_dirty_nobuffers+0xfe/0x108 [] __btrfs_cow_block+0x118/0x3b5 [btrfs] [] btrfs_cow_block+0x103/0x14e [btrfs] [] btrfs_search_slot+0x249/0x6a4 [btrfs] [] btrfs_lookup_inode+0x2a/0x8a [btrfs] [] btrfs_update_inode+0xaa/0x141 [btrfs] [] btrfs_save_ino_cache+0xea/0x202 [btrfs] [] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [] commit_fs_roots+0xaa/0x158 [btrfs] [] btrfs_commit_transaction+0x405/0x731 [btrfs] [] ? wake_up_bit+0x25/0x25 [] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [] btrfs_sync_file+0x16a/0x198 [btrfs] [] ? mntput+0x21/0x23 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b Sometimes it causes the BUG_ON() in the delayed inode reservation code to be triggered. So we must reserve enough space for the inode cache. Note: if we cannot reserve enough space for the inode cache, we will give up writing it out. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 53dcbdf446cd..f8962a957d65 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -477,11 +494,14 @@ again: } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; -- cgit v1.2.3-58-ga151 From 3254c87618354e58fa2a7b375c6664f567480c33 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix unreleased path in btrfs_orphan_cleanup() When we ran a stress test for space relocation, a deadlock happened. By debugging, we found it was caused by carelessness: we forgot to unlock the read locks on the extent buffers in btrfs_orphan_cleanup() before ending the transaction handle, so the transaction commit task waited for the task that called btrfs_orphan_cleanup() to unlock the extent buffer, while that task waited for the commit task to finish the transaction commit, and the deadlock happened. Fix it.
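The deadlock above has a textbook shape: task A holds a tree-buffer lock and waits on the transaction commit, while the commit task waits on that same lock. The rule the fix enforces, sketched here as an ordering constraint rather than the literal patch, is to drop the path (and the extent-buffer locks it pins) before anything that can wait on the commit:

	btrfs_release_path(path);		/* drops the extent-buffer read locks */
	ret = btrfs_end_transaction(trans, root);	/* may block on the commit */

Running those two steps in the opposite order recreates the A-waits-B, B-waits-A cycle from the report.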
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d394335bcb5..e16215f480d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2201,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) -- cgit v1.2.3-58-ga151 From 61b520a9d0083b9b361638e456af45fd75150c87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: Abstract similar code for btrfs_block_rsv_add{, _noflush} btrfs_block_rsv_add{, _noflush}() have similar code, so abstract that code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b044e509e9f..0f47b3e2010e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3796,16 +3796,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3814,22 +3814,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + int btrfs_block_rsv_add_noflush(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } - - return ret; + return __block_rsv_add(root, block_rsv, num_bytes, 0); } int btrfs_block_rsv_check(struct btrfs_root *root, -- cgit v1.2.3-58-ga151 From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix orphan backref nodes If the root node of a fs/file tree is in the block group that is being relocated but the other nodes are not, and we create a snapshot for this tree between the time the relocation tree creation ends and the time ->create_reloc_tree is set to 0, Btrfs will create some backref nodes that are the lowest nodes of the backref cache. But we forget to add them to the ->leaves list of the backref cache and deal with them, so at last they will trigger a BUG_ON(): kernel BUG at fs/btrfs/relocation.c:239! This patch fixes it by adding them to the ->leaves list of the backref cache.
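Stated as an invariant, the fix that follows says: every node inserted at the lowest level of the backref cache must also be threaded onto the cache's leaves list, because teardown walks only that list to find bottom-level nodes. In sketch form (names follow the description above, not the exact code):

	/* a lowest-level node has no edges hanging below it */
	if (list_empty(&new_node->lower))
		list_add_tail(&new_node->lower, &cache->leaves);

The cloned nodes created while snapshotting took a path that skipped this step, so the consistency check at relocation.c:239 eventually met nodes it had never been told about.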
From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix orphan backref nodes If the root node of a fs/file tree is in a block group that is being relocated but the other nodes are not, and we create a snapshot of this tree between the time the relocation tree creation ends and the time ->create_reloc_tree is set to 0, Btrfs will create some backref nodes that are the lowest nodes of the backref cache. But we forgot to add them to the ->leaves list of the backref cache and deal with them, so eventually they triggered the BUG_ON(): kernel BUG at fs/btrfs/relocation.c:239! This patch fixes it by adding them to the ->leaves list of the backref cache. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 24d654ce7a06..dff29d5e151a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, -- cgit v1.2.3-58-ga151

From 2f120c05e67ae34c93786b1050c6828904314429 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: only map pages if we know we need them when reading the space cache People have been running into a warning when loading the space cache because the page is already mapped when we try to read in a bitmap. The way we read in entries and pages is kind of convoluted, so fix it so that io_ctl_read_entry maps the entries if it needs to, and if it hits the end of the page it simply unmaps the page. That way we can unconditionally unmap the io_ctl before reading in the bitmap, and we should stop hitting these warnings. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7a15fcfb3e1f..181760f9d2ab 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -537,6 +537,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); @@ -550,10 +557,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, io_ctl_unmap_page(io_ctl); - if (io_ctl->index >= io_ctl->num_pages) - return 0; - - return io_ctl_check_crc(io_ctl, io_ctl->index); + return 0; } static int io_ctl_read_bitmap(struct io_ctl *io_ctl, @@ -561,9 +565,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, { int ret; - if (io_ctl->cur && io_ctl->cur != io_ctl->orig) - io_ctl_unmap_page(io_ctl); - ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; @@ -699,6 +700,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, num_entries--; } + io_ctl_unmap_page(&io_ctl); + /* * We add the bitmaps at the end of the entries in order that * the bitmap entries are added to the cache. -- cgit v1.2.3-58-ga151
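In outline, the read pattern after this patch is (a sketch, not the literal code; page_exhausted is a placeholder name for the io_ctl bookkeeping check, not a real variable in the file):

	if (!io_ctl->cur) {
		/* nothing mapped yet: CRC-check the next page, which maps it */
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}
	/* ... decode one btrfs_free_space_entry from io_ctl->cur ... */
	if (page_exhausted)             /* that was the last entry on the page */
		io_ctl_unmap_page(io_ctl);
	return 0;

Because the mapping is dropped as soon as a page is exhausted, the bitmap path can assume no page is mapped and no longer needs its own unmap-before-read check.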
From 62f30c5462374b991e7e3f42d49ce2265c1b82f1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix deadlock caused by the race between relocation We cannot do a flushable reservation for relocation when we create a snapshot, because it may make the transaction commit task and the flush task wait for each other, and a deadlock happens. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 960835eaf4da..6a0574e923bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -882,8 +882,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; -- cgit v1.2.3-58-ga151

From 69f4cb526bd02ae5af35846f9a710c099eec3347 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 11 Nov 2011 08:17:10 -0500 Subject: Btrfs: handle bio_add_page failure gracefully in scrub Currently scrub fails with ENOMEM when bio_add_page fails. Unfortunately dm-based targets accept only one page per bio, thus making scrub always fail. This patch just submits the current bio when an error is encountered and starts a new one. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 64 +++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed11d3866afd..f4190f22edfb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -944,50 +944,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -995,6 +963,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -1015,12 +985,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -1030,6 +1010,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); -- cgit v1.2.3-58-ga151
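The crux is the retry on a full bio: bio_add_page() returns the number of bytes added, so 0 means the bio cannot take the page (with dm targets, after just one page). As added in the hunk above:

	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
	if (!ret) {                     /* bio is full: flush and retry */
		__free_page(page);
		ret = scrub_submit(sdev);   /* submit the partially filled bio */
		if (ret)
			return ret;
		goto again;                 /* sets up a fresh bio, then re-adds */
	}

This turns the former hard -ENOMEM failure into a shorter bio followed by a new one.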
From 8965593e41dd2d0e2a2f1e6f245336005ea94a2c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 11 Nov 2011 10:14:57 -0500 Subject: btrfs: rename the option to nospace_cache Rename the no_space_cache option to nospace_cache to be more consistent with the rest, where the simple prefix 'no' is used to negate an option. The option was introduced during the -rc1 cycle and has not been widely used, so the rename is safe. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 629281c65ff5..8bd9d6d0e07a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -197,7 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, - {Opt_no_space_cache, "no_space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -711,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); else - seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) -- cgit v1.2.3-58-ga151
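With the rename, the cache is disabled at mount time as, for example (device and mount point are placeholders):

	# mount -t btrfs -o nospace_cache /dev/sdb /mnt

Since the old no_space_cache spelling is removed from the option table above, any fstab entries or scripts still using it would need to be updated to the new spelling.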