From 6dda9266913ad57e09afc1a10d6473f10c806a63 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Thu, 11 Aug 2011 15:14:39 -0700 Subject: pstore: defer inserting OOPS entries into pstore Life is simple for all the kernel-terminating types of kmsg_dump callbacks - pstore just saves the tail end of the console log. But for "oops" the situation is more complex - the kernel may carry on running (possibly forever). So we'd like to make the logged copy of the oops appear in the pstore filesystem, so that the user has a handle to clear the entry from the persistent backing store (if we don't, the store may fill with "oops" entries, which are also safely stashed in /var/log/messages, leaving no space for real errors). Current code calls pstore_mkfile() immediately. But this may not be safe. The oops could have happened with arbitrary locks held, or in interrupt or NMI context. So allocating memory and calling into generic filesystem code seems unwise. This patch defers making the entry appear. At the time of the oops, we merely set a flag "pstore_new_entry" noting that a new entry has been added. A periodic timer checks once a minute to see if the flag is set - if so, it schedules a work item to rescan the backing store and make all new entries appear in the pstore filesystem. Signed-off-by: Tony Luck --- fs/pstore/inode.c | 40 ++++++++++++++++++++++++++++++++++++++++---- fs/pstore/internal.h | 2 +- fs/pstore/platform.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 82 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 893b961dcfd8..379a02dc1217 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,13 +33,18 @@ #include #include #include +#include #include #include "internal.h" #define PSTORE_NAMELEN 64 +static DEFINE_SPINLOCK(allpstore_lock); +static LIST_HEAD(allpstore); + struct pstore_private { + struct list_head list; struct pstore_info *psi; enum pstore_type_id type; u64 id; @@ -81,8 +87,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { + struct pstore_private *p = inode->i_private; + unsigned long flags; + end_writeback(inode); - kfree(inode->i_private); + if (p) { + spin_lock_irqsave(&allpstore_lock, flags); + list_del(&p->list); + spin_unlock_irqrestore(&allpstore_lock, flags); + kfree(p); + } } static const struct inode_operations pstore_dir_inode_operations = { @@ -182,9 +196,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; - int rc; + int rc = 0; char name[PSTORE_NAMELEN]; - struct pstore_private *private; + struct pstore_private *private, *pos; + unsigned long flags; + + spin_lock_irqsave(&allpstore_lock, flags); + list_for_each_entry(pos, &allpstore, list) { + if (pos->type == type && + pos->id == id && + pos->psi == psi) { + rc = -EEXIST; + break; + } + } + spin_unlock_irqrestore(&allpstore_lock, flags); + if (rc) + return rc; rc = -ENOMEM; inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); @@ -229,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, d_add(dentry, inode); + spin_lock_irqsave(&allpstore_lock, flags); + list_add(&private->list, &allpstore); + spin_unlock_irqrestore(&allpstore_lock, flags); + mutex_unlock(&root->d_inode->i_mutex); return 0; @@ -277,7 +309,7 @@ int
pstore_fill_super(struct super_block *sb, void *data, int silent) goto fail; } - pstore_get_records(); + pstore_get_records(0); return 0; fail: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 611c1b3c46fa..3bde461c3f34 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,5 +1,5 @@ extern void pstore_set_kmsg_bytes(int); -extern void pstore_get_records(void); +extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, struct timespec time, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c5300ec31696..ca60ebcfb15f 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -25,11 +25,28 @@ #include #include #include +#include #include #include +#include #include "internal.h" +/* + * We defer making "oops" entries appear in pstore - see + * whether the system is actually still running well enough + * to let someone see the entry + */ +#define PSTORE_INTERVAL (60 * HZ) + +static int pstore_new_entry; + +static void pstore_timefunc(unsigned long); +static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); + +static void pstore_dowork(struct work_struct *); +static DECLARE_WORK(pstore_work, pstore_dowork); + /* * pstore_lock just protects "psinfo" during * calls to pstore_register() @@ -100,9 +117,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, id = psinfo->write(PSTORE_TYPE_DMESG, part, hsize + l1_cpy + l2_cpy, psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, - psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo); + pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; @@ -148,19 +163,24 @@ int pstore_register(struct pstore_info *psi) } if (pstore_is_mounted()) - pstore_get_records(); + pstore_get_records(0); kmsg_dump_register(&pstore_dumper); + pstore_timer.expires = jiffies + PSTORE_INTERVAL; + add_timer(&pstore_timer); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); /* - * Read all the records from the persistent store. Create and - * file files in our filesystem. + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. */ -void pstore_get_records(void) +void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; ssize_t size; @@ -178,8 +198,9 @@ void pstore_get_records(void) goto out; while ((size = psi->read(&id, &type, &time, psi)) > 0) { - if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi)) + rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, + time, psi); + if (rc && (rc != -EEXIST || !quiet)) failed++; } psi->close(psi); @@ -191,6 +212,21 @@ out: failed, psi->name); } +static void pstore_dowork(struct work_struct *work) +{ + pstore_get_records(1); +} + +static void pstore_timefunc(unsigned long dummy) +{ + if (pstore_new_entry) { + pstore_new_entry = 0; + schedule_work(&pstore_work); + } + + mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); +} + /* * Call platform driver to write a record to the * persistent store. -- cgit v1.2.3-58-ga151 From abd4d5587be911f63592537284dad78766d97d62 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Aug 2011 10:54:51 -0700 Subject: pstore: change mutex locking to spin_locks pstore was using mutex locking to protect read/write access to the backend plug-ins. 
This causes problems when pstore is executed in an NMI context through panic() -> kmsg_dump(). This patch changes the mutex to a spinlock taken with spin_lock_irqsave, and also checks to see if we are in an NMI context. If we are in an NMI and can't get the lock, just print a message stating that and blow by the locking. All this is probably a hack around the bigger locking problem, but it solves my current situation of trying to sleep in an NMI context. Tested by loading the lkdtm module and executing a HARDLOCKUP, which will cause the machine to panic inside the NMI handler. Signed-off-by: Don Zickus Acked-by: Matthew Garrett Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 2 +- drivers/firmware/efivars.c | 2 +- fs/pstore/platform.c | 28 +++++++++++++++++++++------- include/linux/pstore.h | 2 +- 4 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 2ca59dc69f7f..5e820ea35570 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -1165,7 +1165,7 @@ static int __init erst_init(void) goto err_release_erange; buf = kmalloc(erst_erange.size, GFP_KERNEL); - mutex_init(&erst_info.buf_mutex); + spin_lock_init(&erst_info.buf_lock); if (buf) { erst_info.buf = buf + sizeof(struct cper_pstore_record); erst_info.bufsize = erst_erange.size - diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index eb80b549ed8d..be8bcb035e2a 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -978,7 +978,7 @@ int register_efivars(struct efivars *efivars, if (efivars->efi_pstore_info.buf) { efivars->efi_pstore_info.bufsize = 1024; efivars->efi_pstore_info.data = efivars; - mutex_init(&efivars->efi_pstore_info.buf_mutex); + spin_lock_init(&efivars->efi_pstore_info.buf_lock); pstore_register(&efivars->efi_pstore_info); } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index ca60ebcfb15f..0472924024cc 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "internal.h" @@ -88,13 +89,20 @@ static void pstore_dump(struct kmsg_dumper *dumper, u64 id; int hsize; unsigned int part = 1; + unsigned long flags = 0; + int is_locked = 0; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; else why = "Unknown"; - mutex_lock(&psinfo->buf_mutex); + if (in_nmi()) { + is_locked = spin_trylock(&psinfo->buf_lock); + if (!is_locked) + pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + } else + spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; @@ -123,7 +131,11 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += l1_cpy + l2_cpy; part++; } - mutex_unlock(&psinfo->buf_mutex); + if (in_nmi()) { + if (is_locked) + spin_unlock(&psinfo->buf_lock); + } else + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -188,11 +200,12 @@ void pstore_get_records(int quiet) enum pstore_type_id type; struct timespec time; int failed = 0, rc; + unsigned long flags; if (!psi) return; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); rc = psi->open(psi); if (rc) goto out; @@ -205,7 +218,7 @@ void pstore_get_records(int quiet) } psi->close(psi); out: - mutex_unlock(&psinfo->buf_mutex); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); if (failed) printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", @@ -233,7 +246,8 @@ static void
pstore_timefunc(unsigned long dummy) */ int pstore_write(enum pstore_type_id type, char *buf, size_t size) { - u64 id; + u64 id; + unsigned long flags; if (!psinfo) return -ENODEV; @@ -241,13 +255,13 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) if (size > psinfo->bufsize) return -EFBIG; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); memcpy(psinfo->buf, buf, size); id = psinfo->write(type, 0, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, size, CURRENT_TIME, psinfo); - mutex_unlock(&psinfo->buf_mutex); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); return 0; } diff --git a/include/linux/pstore.h b/include/linux/pstore.h index cc03bbf5c4b8..b91440e64d6e 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -32,7 +32,7 @@ enum pstore_type_id { struct pstore_info { struct module *owner; char *name; - struct mutex buf_mutex; /* serialize access to 'buf' */ + spinlock_t buf_lock; /* serialize access to 'buf' */ char *buf; size_t bufsize; int (*open)(struct pstore_info *psi); -- cgit v1.2.3-58-ga151 From 832023bffb4b493f230be901f681020caf3ed1f8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 8 Aug 2011 17:38:08 +0200 Subject: nfsd4: Remove check for a 32-bit cookie in nfsd4_readdir() Fan Yong noticed setting FMODE_32bithash wouldn't work with nfsd v4, as nfsd4_readdir() checks for 32 bit cookies. However, according to RFC 3530 cookies have a 64 bit type and cookies are also defined as u64 in 'struct nfsd4_readdir'. So remove the test for >32-bit values. Cc: stable@kernel.org Signed-off-by: Bernd Schubert Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e80777666618..9bf0a6625187 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -691,7 +691,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); - if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || + if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) return nfserr_bad_cookie; -- cgit v1.2.3-58-ga151 From 5a0143a4f00517ea433bf459a80742ccc623a665 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 28 Jul 2011 17:47:10 +0200 Subject: ext3: Remove i_mutex from ext3_sync_file() ext3_sync_file() does not need i_mutex for anything so just drop it. Signed-off-by: Jan Kara --- fs/ext3/fsync.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d494c554c6e6..1860ed356323 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - /* - * Taking the mutex here just to keep consistent with how fsync was - * called previously, however it looks like we don't need to take - * i_mutex at all. - */ - mutex_lock(&inode->i_mutex); - J_ASSERT(ext3_journal_current_handle() == NULL); /* @@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * safe in-journal, which is all fsync() needs to ensure. 
*/ if (ext3_should_journal_data(inode)) { - mutex_unlock(&inode->i_mutex); ret = ext3_force_commit(inode->i_sb); goto out; } @@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - - mutex_unlock(&inode->i_mutex); out: trace_ext3_sync_file_exit(inode, ret); return ret; -- cgit v1.2.3-58-ga151 From 1cde201da4e97f10a5dd2434cff4ceff381603d1 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Tue, 2 Aug 2011 18:16:57 +0900 Subject: ext3: fix message in ext3_remount for rw-remount case If there are some inodes in the orphan list while a filesystem is mounted read-only, we should recommend that people umount and then mount it when they try to remount with read-write. But the current message and comment recommend that they umount and then remount it, which may be slightly misleading. Signed-off-by: Toshiyuki Okajima Signed-off-by: Jan Kara --- fs/ext3/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7beb69ae0015..2043bcc87719 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } -- cgit v1.2.3-58-ga151 From 6e3d6ca0bf91bcce0453fff9b597154ff6bb9731 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 4 Aug 2011 12:29:32 +0200 Subject: fs/ext2/balloc.c: delete useless initialization Delete nontrivial initialization that is immediately overwritten by the result of an allocation function. The semantic match that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; identifier i; expression e; @@ ( T i = \(0\|NULL\|ERR_PTR(...)\); | -T i = e; +T i; ) ... when != i i = \(kzalloc\|kcalloc\|kmalloc\)(...); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 8f44cef1b3ef..a8cbe1bc6ad4 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv) void ext2_init_block_alloc_info(struct inode *inode) { struct ext2_inode_info *ei = EXT2_I(inode); - struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -- cgit v1.2.3-58-ga151 From 46130222df8567ffde773216044c7611a1e71d51 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 4 Aug 2011 12:29:31 +0200 Subject: fs/ext3/balloc.c: delete useless initialization Delete nontrivial initialization that is immediately overwritten by the result of an allocation function. The semantic match that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; identifier i; expression e; @@ ( T i = \(0\|NULL\|ERR_PTR(...)\); | -T i = e; +T i; ) ...
when != i i = \(kzalloc\|kcalloc\|kmalloc\)(...); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 6386d76f44a7..caedc00520ed 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -- cgit v1.2.3-58-ga151 From fbc854027c91fa2813ae7f9de43cc0b5c1119f41 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Aug 2011 18:08:06 +0200 Subject: ext3: remove deprecated oldalloc For a long time now, Orlov has been the default block allocator in ext3. It performs better than the old one, and no one seems to claim otherwise, so we can safely drop the old allocator and deprecate the oldalloc and orlov mount options. Signed-off-by: Lukas Czerner Signed-off-by: Jan Kara --- Documentation/filesystems/ext3.txt | 8 -------- fs/ext3/ialloc.c | 45 +++------------------------------------------------------- fs/ext3/super.c | 8 ++++---- include/linux/ext3_fs.h | 2 +- 4 files changed, 8 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 22f3a0eda1d2..b100adc38adb 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -73,14 +73,6 @@ nobarrier (*) This also requires an IO stack which can support also be used to enable or disable barriers, for consistency with other ext3 mount options. -orlov (*) This enables the new Orlov block allocator. It is - enabled by default. - -oldalloc This disables the Orlov block allocator and enables - the old block allocator. Orlov should have better - performance - we'd like to get some feedback if it's - the contrary for you. - user_xattr Enables Extended User Attributes. Additionally, you need to have extended attribute support enabled in the kernel configuration (CONFIG_EXT3_FS_XATTR). See the diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf938cc..635bd8ce6d59 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -177,42 +177,6 @@ error_return: ext3_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block * group to find a free inode.
- */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - /* * Orlov's allocator for directories. * @@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2043bcc87719..922d289aeeb3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 67a803aee619..96a30b95e5c2 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -381,7 +381,7 @@ struct ext3_inode { * Mount flags */ #define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ -#define EXT3_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +/* EXT3_MOUNT_OLDALLOC was there */ #define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ -- cgit v1.2.3-58-ga151 From 576163005de286bbd418fcb99cfd0971523a0c6d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 10 Aug 2011 19:16:22 -0400 Subject: nfsd4: fix seqid_mutating_error The set of errors here does *not* agree with the set of errors specified in the RFC! While we're there, turn this macro into a function, for the usual reasons, and move it to the one place where it's actually used. Cc: stable@kernel.org Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4xdr.c | 12 ++++++++++++ fs/nfsd/state.h | 6 ------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c8bf405d19de..f81099605256 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1623,6 +1623,18 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) \ save = resp->p; +static bool seqid_mutating_err(__be32 err) +{ + /* rfc 3530 section 8.1.5: */ + return err != nfserr_stale_clientid && + err != nfserr_stale_stateid && + err != nfserr_bad_stateid && + err != nfserr_bad_seqid && + err != nfserr_bad_xdr && + err != nfserr_resource && + err != nfserr_nofilehandle; +} + /* * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This * is where sequence id's are incremented, and the replay cache is filled. diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4eefaf1b42e8..5cfebe504056 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -447,12 +447,6 @@ struct nfs4_stateid { #define WR_STATE 0x00000020 #define CLOSE_STATE 0x00000040 -#define seqid_mutating_err(err) \ - (((err) != nfserr_stale_clientid) && \ - ((err) != nfserr_bad_seqid) && \ - ((err) != nfserr_stale_stateid) && \ - ((err) != nfserr_bad_stateid)) - struct nfsd4_compound_state; extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, -- cgit v1.2.3-58-ga151 From aadab6c6f4da38d639394de740602f146c88da0c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 16:55:02 -0400 Subject: nfsd4: return nfserr_symlink on v4 OPEN of non-regular file Without this, an attempt to open a device special file without first stat'ing it will fail. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9bf0a6625187..d784ceb81a62 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -168,6 +168,24 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return status; } +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = fh->fh_dentry->d_inode->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { @@ -216,6 +234,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &resfh); fh_unlock(current_fh); + if (status) + goto out; + status = nfsd_check_obj_isreg(&resfh); } if (status) goto out; -- cgit v1.2.3-58-ga151 From ab83fa4b49a54e6199b076b7d8c1808144e80f0d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Jul 2011 20:10:51 -0400 Subject: locks: minor lease cleanup Use a helper function, to simplify upcoming changes. Reviewed-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/locks.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 703f545097de..c528522862b1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -133,6 +133,11 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +static bool lease_breaking(struct file_lock *fl) +{ + return fl->fl_type & F_INPROGRESS; +} + int leases_enable = 1; int lease_break_time = 45; @@ -1141,7 +1146,7 @@ static void time_out_leases(struct inode *inode) struct file_lock *fl; before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { + while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { if ((fl->fl_break_time == 0) || time_before(jiffies, fl->fl_break_time)) { before = &fl->fl_next; @@ -1189,7 +1194,7 @@ int __break_lease(struct inode *inode, unsigned int mode) if (want_write) { /* If we want write access, we have to revoke any lease. */ future = F_UNLCK | F_INPROGRESS; - } else if (flock->fl_type & F_INPROGRESS) { + } else if (lease_breaking(flock)) { /* If the lease is already being broken, we just leave it */ future = flock->fl_type; } else if (flock->fl_type & F_WRLCK) { @@ -1246,7 +1251,7 @@ restart: /* Wait for the next lease that has not been broken yet */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (flock->fl_type & F_INPROGRESS) + if (lease_breaking(flock)) goto restart; } error = 0; @@ -2126,7 +2131,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } else if (IS_LEASE(fl)) { seq_printf(f, "LEASE "); - if (fl->fl_type & F_INPROGRESS) + if (lease_breaking(fl)) seq_printf(f, "BREAKING "); else if (fl->fl_file) seq_printf(f, "ACTIVE "); @@ -2142,7 +2147,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { seq_printf(f, "%s ", - (fl->fl_type & F_INPROGRESS) + (lease_breaking(fl)) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); } -- cgit v1.2.3-58-ga151 From 710b7216964d6455cf1b215c43b03a1a79008c7d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Jul 2011 16:28:29 -0400 Subject: locks: move F_INPROGRESS from fl_type to fl_flags field F_INPROGRESS isn't exposed to userspace. To me it makes more sense in fl_flags.... Reviewed-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- arch/alpha/include/asm/fcntl.h | 2 -- fs/locks.c | 14 ++++++++------ include/asm-generic/fcntl.h | 5 ----- include/linux/fs.h | 3 ++- 4 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h index 1b71ca70c9f6..6d9e805f18a7 100644 --- a/arch/alpha/include/asm/fcntl.h +++ b/arch/alpha/include/asm/fcntl.h @@ -51,8 +51,6 @@ #define F_EXLCK 16 /* or 3 */ #define F_SHLCK 32 /* or 4 */ -#define F_INPROGRESS 64 - #include #endif diff --git a/fs/locks.c b/fs/locks.c index c528522862b1..c4215418bca3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,7 @@ static bool lease_breaking(struct file_lock *fl) { - return fl->fl_type & F_INPROGRESS; + return fl->fl_flags & FL_INPROGRESS; } int leases_enable = 1; @@ -1132,6 +1132,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; + fl->fl_flags &= ~FL_INPROGRESS; locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1152,7 +1153,7 @@ static void time_out_leases(struct inode *inode) before = &fl->fl_next; continue; } - lease_modify(before, fl->fl_type & ~F_INPROGRESS); + lease_modify(before, fl->fl_type); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1193,13 +1194,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if (want_write) { /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK | F_INPROGRESS; + future = F_UNLCK; } else if (lease_breaking(flock)) { /* If the lease is already being broken, we just leave it */ future = flock->fl_type; } else if (flock->fl_type & F_WRLCK) { /* Downgrade the exclusive lease to a read-only lease. */ - future = F_RDLCK | F_INPROGRESS; + future = F_RDLCK; } else { /* the existing lease was read-only, so we can read too. 
*/ goto out; @@ -1221,6 +1222,7 @@ int __break_lease(struct inode *inode, unsigned int mode) for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_type != future) { fl->fl_type = future; + fl->fl_flags |= FL_INPROGRESS; fl->fl_break_time = break_time; /* lease must have lmops break callback */ fl->fl_lmops->lm_break(fl); @@ -1319,7 +1321,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type & ~F_INPROGRESS; + type = fl->fl_type; break; } } @@ -1384,7 +1386,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) before = &fl->fl_next) { if (fl->fl_file == filp) my_before = before; - else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) + else if ((fl->fl_type == F_UNLCK) && lease_breaking(fl)) /* * Someone is in the process of opening this * file for writing so we may not take an diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h index 84793c7025e2..9e5b0356e2bb 100644 --- a/include/asm-generic/fcntl.h +++ b/include/asm-generic/fcntl.h @@ -145,11 +145,6 @@ struct f_owner_ex { #define F_SHLCK 8 /* or 4 */ #endif -/* for leases */ -#ifndef F_INPROGRESS -#define F_INPROGRESS 16 -#endif - /* operations for bsd flock(), also used by the kernel implementation */ #define LOCK_SH 1 /* shared lock */ #define LOCK_EX 2 /* exclusive lock */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 178cdb4f1d4a..327fdd4de85f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1065,6 +1065,7 @@ static inline int file_check_writeable(struct file *filp) #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ +#define FL_INPROGRESS 256 /* Lease is being broken */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -1111,7 +1112,7 @@ struct file_lock { struct list_head fl_link; /* doubly linked list of all locks */ struct list_head fl_block; /* circular list of blocked processes */ fl_owner_t fl_owner; - unsigned char fl_flags; + unsigned int fl_flags; unsigned char fl_type; unsigned int fl_pid; struct pid *fl_nspid; -- cgit v1.2.3-58-ga151 From 778fc546f749c588aa2f6cd50215d2715c374252 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Jul 2011 18:25:49 -0400 Subject: locks: fix tracking of inprogress lease breaks We currently use a bit in fl_flags to record whether a lease is being broken, and set fl_type to the type (RDLCK or UNLCK) that it will eventually have. This means that once the lease break starts, we forget what the lease's type *used* to be. Breaking a read lease will then result in blocking read opens, even though there's no conflict--because the lease type is now F_UNLCK and we can no longer tell whether it was previously a read or write lease. So, instead keep fl_type as the original type (the type which we enforce), and keep track of whether we're unlocking or merely downgrading by replacing the single FL_INPROGRESS flag by FL_UNLOCK_PENDING and FL_DOWNGRADE_PENDING flags. To get this right we also need to track separate downgrade and break times, to handle the case where a write-leased file gets conflicting opens first for read, then later for write. 
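To make the tracked states concrete, here is a minimal userspace sketch (an editor's illustration, not part of the patch; the file name "testfile" is an assumption) of the lease API whose semantics this change straightens out. While a break is in progress, F_GETLEASE reports the target type the lease is converting to, and a read-only conflict merely downgrades a write lease:

    #define _GNU_SOURCE             /* for F_SETLEASE/F_GETLEASE */
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void on_break(int sig) { (void)sig; /* SIGIO: lease is being broken */ }

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        signal(SIGIO, on_break);        /* default lease-break notification */
        if (fcntl(fd, F_SETLEASE, F_WRLCK) < 0) {
            perror("F_SETLEASE");       /* needs file ownership or CAP_LEASE */
            return 1;
        }
        /* With no conflicting opener this prints F_WRLCK. If another
         * process opens the file O_RDONLY, the kernel starts a downgrade
         * and this reports F_RDLCK (the target type); an O_RDWR opener
         * starts a full break and it reports F_UNLCK. */
        printf("lease type: %d (F_WRLCK=%d)\n", fcntl(fd, F_GETLEASE), F_WRLCK);

        pause();                        /* wait for a conflicting open */
        fcntl(fd, F_SETLEASE, F_UNLCK); /* voluntarily release */
        return 0;
    }

Run it, then open the file from a second shell read-only or read-write to observe the downgrade and the full break respectively.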
(I first considered just eliminating the downgrade behavior completely--nfsv4 doesn't need it, and nobody as far as I can tell actually uses it currently--but Jeremy Allison tells me that Windows oplocks do behave this way, so Samba will probably use this some day.) Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/locks.c | 87 ++++++++++++++++++++++++++++++++++-------------------- include/linux/fs.h | 7 +++-- 2 files changed, 60 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index c4215418bca3..c525aa4de234 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,16 @@ static bool lease_breaking(struct file_lock *fl) { - return fl->fl_flags & FL_INPROGRESS; + return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); +} + +static int target_leasetype(struct file_lock *fl) +{ + if (fl->fl_flags & FL_UNLOCK_PENDING) + return F_UNLCK; + if (fl->fl_flags & FL_DOWNGRADE_PENDING) + return F_RDLCK; + return fl->fl_type; } int leases_enable = 1; @@ -1124,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode, EXPORT_SYMBOL(locks_mandatory_area); +static void lease_clear_pending(struct file_lock *fl, int arg) +{ + switch (arg) { + case F_UNLCK: + fl->fl_flags &= ~FL_UNLOCK_PENDING; + /* fall through: */ + case F_RDLCK: + fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + } +} + /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock **before, int arg) { @@ -1132,7 +1152,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; - fl->fl_flags &= ~FL_INPROGRESS; + lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1141,6 +1161,14 @@ int lease_modify(struct file_lock **before, int arg) EXPORT_SYMBOL(lease_modify); +static bool past_time(unsigned long then) +{ + if (!then) + /* 0 is a special value meaning "this never expires": */ + return false; + return time_after(jiffies, then); +} + static void time_out_leases(struct inode *inode) { struct file_lock **before; @@ -1148,12 +1176,10 @@ static void time_out_leases(struct inode *inode) before = &inode->i_flock; while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { - if ((fl->fl_break_time == 0) - || time_before(jiffies, fl->fl_break_time)) { - before = &fl->fl_next; - continue; - } - lease_modify(before, fl->fl_type); + if (past_time(fl->fl_downgrade_time)) + lease_modify(before, F_RDLCK); + if (past_time(fl->fl_break_time)) + lease_modify(before, F_UNLCK); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1171,7 +1197,7 @@ static void time_out_leases(struct inode *inode) */ int __break_lease(struct inode *inode, unsigned int mode) { - int error = 0, future; + int error = 0; struct file_lock *new_fl, *flock; struct file_lock *fl; unsigned long break_time; @@ -1188,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if ((flock == NULL) || !IS_LEASE(flock)) goto out; + if (!locks_conflict(flock, new_fl)) + goto out; + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (want_write) { - /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK; - } else if (lease_breaking(flock)) { - /* If the lease is already being broken, we just leave it */ - future = flock->fl_type; - } else if (flock->fl_type & F_WRLCK) { - /* Downgrade the exclusive lease to a read-only lease. 
*/ - future = F_RDLCK; - } else { - /* the existing lease was read-only, so we can read too. */ - goto out; - } - if (IS_ERR(new_fl) && !i_have_this_lease && ((mode & O_NONBLOCK) == 0)) { error = PTR_ERR(new_fl); @@ -1220,13 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode) } for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (fl->fl_type != future) { - fl->fl_type = future; - fl->fl_flags |= FL_INPROGRESS; + if (want_write) { + if (fl->fl_flags & FL_UNLOCK_PENDING) + continue; + fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; - /* lease must have lmops break callback */ - fl->fl_lmops->lm_break(fl); + } else { + if (lease_breaking(flock)) + continue; + fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->fl_downgrade_time = break_time; } + fl->fl_lmops->lm_break(fl); } if (i_have_this_lease || (mode & O_NONBLOCK)) { @@ -1250,10 +1270,13 @@ restart: if (error >= 0) { if (error == 0) time_out_leases(inode); - /* Wait for the next lease that has not been broken yet */ + /* + * Wait for the next conflicting lease that has not been + * broken yet + */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (lease_breaking(flock)) + if (locks_conflict(new_fl, flock)) goto restart; } error = 0; @@ -1321,7 +1344,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type; + type = target_leasetype(fl); break; } } @@ -1386,7 +1409,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) before = &fl->fl_next) { if (fl->fl_file == filp) my_before = before; - else if ((fl->fl_type == F_UNLCK) && lease_breaking(fl)) + else if (fl->fl_flags & FL_UNLOCK_PENDING) /* * Someone is in the process of opening this * file for writing so we may not take an diff --git a/include/linux/fs.h b/include/linux/fs.h index 327fdd4de85f..76460edf1648 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1065,7 +1065,8 @@ static inline int file_check_writeable(struct file *filp) #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ -#define FL_INPROGRESS 256 /* Lease is being broken */ +#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ +#define FL_UNLOCK_PENDING 512 /* Lease is being broken */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -1122,7 +1123,9 @@ struct file_lock { loff_t fl_end; struct fasync_struct * fl_fasync; /* for lease break notifications */ - unsigned long fl_break_time; /* for nonblocking lease breaks */ + /* for lease breaks: */ + unsigned long fl_break_time; + unsigned long fl_downgrade_time; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ -- cgit v1.2.3-58-ga151 From c1f24ef4ed46f58ea5e524a2364c93b6847fb164 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 19 Aug 2011 10:59:49 -0400 Subject: locks: setlease cleanup There's an incorrect comment here. Also clean up the logic: the "rdlease" and "wrlease" locals are confusingly named, and don't really add anything since we can make a decision as soon as we hit one of these cases. Signed-off-by: J. 
Bruce Fields --- fs/locks.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index c525aa4de234..9b8408eb6946 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1368,7 +1368,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - int error, rdlease_count = 0, wrlease_count = 0; + int error; lease = *flp; @@ -1404,27 +1404,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) * then the file is not open by anyone (including us) * except for this filp. */ + error = -EAGAIN; for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file == filp) + if (fl->fl_file == filp) { my_before = before; - else if (fl->fl_flags & FL_UNLOCK_PENDING) - /* - * Someone is in the process of opening this - * file for writing so we may not take an - * exclusive lease on it. - */ - wrlease_count++; - else - rdlease_count++; + continue; + } + /* + * No exclusive leases if someone else has a lease on + * this file: + */ + if (arg == F_WRLCK) + goto out; + /* + * Modifying our existing lease is OK, but no getting a + * new lease if someone else is opening for write: + */ + if (fl->fl_flags & FL_UNLOCK_PENDING) + goto out; } - error = -EAGAIN; - if ((arg == F_RDLCK && (wrlease_count > 0)) || - (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) - goto out; - if (my_before != NULL) { error = lease->fl_lmops->lm_change(my_before, arg); if (!error) -- cgit v1.2.3-58-ga151 From 11fd165c68b73434ca1273e21f21db5eecc90926 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Jul 2011 20:04:09 +0200 Subject: sunrpc: use better NUMA affinities Use NUMA aware allocations to reduce latencies and increase throughput. sunrpc kthreads can use kthread_create_on_node() if pool_mode is "percpu" or "pernode", and svc_prepare_thread()/svc_init_buffer() can also take into account NUMA node affinity for memory allocations. Signed-off-by: Eric Dumazet CC: "J. Bruce Fields" CC: Neil Brown CC: David Miller Reviewed-by: Greg Banks [bfields@redhat.com: fix up caller nfs41_callback_up] Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 4 ++-- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 33 ++++++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d7979d..c061b9aa7ddb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -282,7 +282,7 @@ int lockd_up(void) /* * Create the kernel thread and wait for it to start. 
*/ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(nlmsvc_rqst)) { error = PTR_ERR(nlmsvc_rqst); nlmsvc_rqst = NULL; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index e3d294269058..516f3375e067 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv) else goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0]); + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); out_err: if (ret == 0) @@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(rqstp)) { svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 223588a976a0..a78a51e93373 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -404,7 +404,7 @@ struct svc_procedure { struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, - struct svc_pool *pool); + struct svc_pool *pool, int node); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *), diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 6a69a1131fb7..30d70abb4e2c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -295,6 +295,18 @@ svc_pool_map_put(void) } +static int svc_pool_map_get_node(unsigned int pidx) +{ + const struct svc_pool_map *m = &svc_pool_map; + + if (m->count) { + if (m->mode == SVC_POOL_PERCPU) + return cpu_to_node(m->pool_to[pidx]); + if (m->mode == SVC_POOL_PERNODE) + return m->pool_to[pidx]; + } + return NUMA_NO_NODE; +} /* * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. @@ -499,7 +511,7 @@ EXPORT_SYMBOL_GPL(svc_destroy); * We allocate pages and place them in rq_argpages. 
*/ static int -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) { unsigned int pages, arghi; @@ -513,7 +525,7 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) arghi = 0; BUG_ON(pages > RPCSVC_MAXPAGES); while (pages) { - struct page *p = alloc_page(GFP_KERNEL); + struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); if (!p) break; rqstp->rq_pages[arghi++] = p; @@ -536,11 +548,11 @@ svc_release_buffer(struct svc_rqst *rqstp) } struct svc_rqst * -svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; - rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); + rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) goto out_enomem; @@ -554,15 +566,15 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_thread; - rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) goto out_thread; - if (!svc_init_buffer(rqstp, serv->sv_max_mesg)) + if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) goto out_thread; return rqstp; @@ -647,6 +659,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) struct svc_pool *chosen_pool; int error = 0; unsigned int state = serv->sv_nrthreads-1; + int node; if (pool == NULL) { /* The -1 assumes caller has done a svc_get() */ @@ -662,14 +675,16 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) nrservs--; chosen_pool = choose_pool(serv, pool, &state); - rqstp = svc_prepare_thread(serv, chosen_pool); + node = svc_pool_map_get_node(chosen_pool->sp_id); + rqstp = svc_prepare_thread(serv, chosen_pool, node); if (IS_ERR(rqstp)) { error = PTR_ERR(rqstp); break; } __module_get(serv->sv_module); - task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + task = kthread_create_on_node(serv->sv_function, rqstp, + node, serv->sv_name); if (IS_ERR(task)) { error = PTR_ERR(task); module_put(serv->sv_module); -- cgit v1.2.3-58-ga151 From d37854cf99319966f34bb19c7a897b87d478b56c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 22 Aug 2011 16:23:56 +0300 Subject: UBIFS: introduce a helper to dump scanning info This commit adds 'dbg_dump_sleb()' helper function to dump scanning information. 
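The heart of the new helper is the list_for_each_entry() walk over sleb->nodes. For readers outside the kernel tree, this is the intrusive-list pattern: the list node is embedded in the object, and iteration maps each node back to its container. A self-contained userspace sketch (toy names, not UBIFS APIs):

    #include <stddef.h>
    #include <stdio.h>

    /* Toy stand-ins for the kernel's intrusive list machinery. */
    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Rough analog of ubifs_scan_node: payload plus an embedded node. */
    struct scan_node {
        int offs, len;
        struct list_head list;
    };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
    }

    int main(void)
    {
        struct list_head nodes;
        struct scan_node a = { 0, 128 }, b = { 128, 64 };

        list_init(&nodes);
        list_add_tail(&a.list, &nodes);
        list_add_tail(&b.list, &nodes);

        /* The equivalent of list_for_each_entry(snod, &nodes, list):
         * walk the embedded nodes, then map each back to its container. */
        for (struct list_head *p = nodes.next; p != &nodes; p = p->next) {
            struct scan_node *snod = container_of(p, struct scan_node, list);
            printf("node at %d len %d\n", snod->offs, snod->len);
        }
        return 0;
    }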
Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 16 ++++++++++++++++ fs/ubifs/debug.h | 5 +++++ 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a1a927..b09ba2dd8b62 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 45174b534377..2bf84211e320 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); @@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } static inline void dbg_dump_leb(const struct ubifs_info *c, int lnum) { return; } static inline void +dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { return; } +static inline void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode) { return; } static inline void dbg_dump_heap(struct ubifs_info *c, -- cgit v1.2.3-58-ga151 From bd33d12fba60dea3b8f944478c54ac3ba18d2c4d Mon Sep 17 00:00:00 2001 From: Harry Wei Date: Sat, 16 Jul 2011 16:45:13 +0800 Subject: debugfs: Fix a comment mistake The file is fs/debugfs/inode.c but the comment says it is file.c. This patch fixes this little mistake. Signed-off-by: Harry Wei Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e7a7a2f07324..f3a257d7a985 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,5 +1,5 @@ /* - * file.c - part of debugfs, a tiny little debug file system + * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman * Copyright (C) 2004 IBM Inc. -- cgit v1.2.3-58-ga151 From 7f9838fd01833ffb30177d964983076924344c9e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 21 Jul 2011 19:59:22 -0400 Subject: sysfs: count subdirectories sysfs: count subdirectories This patch introduces a subdirectory counter for each sysfs directory. Without the patch, sysfs_refresh_inode would walk all entries of the directory to calculate the number of subdirectories. This patch improves the time of "ls -la /sys/block" when there are 10000 block devices from 9 seconds to 0.19 seconds.
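In other words, the patch trades an O(children) recount on every stat() for O(1) bookkeeping at link/unlink time. A rough userspace sketch of that trade (toy types; in the real code the counter is maintained in sysfs_link_sibling()/sysfs_unlink_sibling() and consumed in sysfs_refresh_inode()):

    #include <assert.h>
    #include <stdio.h>

    /* Toy analog of a sysfs directory entry; only what the example needs. */
    enum { TYPE_DIR, TYPE_FILE };

    struct toy_dirent {
        int type;
        struct toy_dirent *sibling;   /* chained into the parent's list */
        struct toy_dirent *children;
        unsigned long subdirs;        /* the maintained counter */
    };

    static void link_child(struct toy_dirent *parent, struct toy_dirent *sd)
    {
        if (sd->type == TYPE_DIR)
            parent->subdirs++;        /* O(1) bookkeeping on insert;
                                       * unlink would decrement likewise */
        sd->sibling = parent->children;
        parent->children = sd;
    }

    /* The old approach: recount on every stat(), O(children). */
    static unsigned long count_nlink_slow(const struct toy_dirent *parent)
    {
        unsigned long nr = 0;
        for (const struct toy_dirent *c = parent->children; c; c = c->sibling)
            if (c->type == TYPE_DIR)
                nr++;
        return nr + 2;
    }

    int main(void)
    {
        struct toy_dirent root = { TYPE_DIR }, d = { TYPE_DIR }, f = { TYPE_FILE };

        link_child(&root, &d);
        link_child(&root, &f);
        /* The new approach: nlink is just subdirs + 2, O(1). */
        assert(root.subdirs + 2 == count_nlink_slow(&root));
        printf("nlink = %lu\n", root.subdirs + 2);
        return 0;
    }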
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 6 ++++++ fs/sysfs/inode.c | 14 +------------- fs/sysfs/sysfs.h | 2 ++ 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a830d8..7d240e6b7176 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -47,6 +47,9 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) BUG_ON(sd->s_sibling); + if (sysfs_type(sd) == SYSFS_DIR) + parent_sd->s_dir.subdirs++; + /* Store directory entries in order by ino. This allows * readdir to properly restart without having to add a * cursor into the s_dir.children list. @@ -73,6 +76,9 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent **pos; + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; + for (pos = &sd->s_parent->s_dir.children; *pos; pos = &(*pos)->s_sibling) { if (*pos == sd) { diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a81c72..1ee18c81df78 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -static int sysfs_count_nlink(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *child; - int nr = 0; - - for (child = sd->s_dir.children; child; child = child->s_sibling) - if (sysfs_type(child) == SYSFS_DIR) - nr++; - - return nr + 2; -} - static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) { struct sysfs_inode_attrs *iattrs = sd->s_iattr; @@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sysfs_count_nlink(sd); + inode->i_nlink = sd->s_dir.subdirs + 2; } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 845ab3ad229d..6348e2c753f6 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -19,6 +19,8 @@ struct sysfs_elem_dir { struct kobject *kobj; /* children list starts here and goes through sd->s_sibling */ struct sysfs_dirent *children; + + unsigned long subdirs; }; struct sysfs_elem_symlink { -- cgit v1.2.3-58-ga151 From 4f72c0cab40536a0be501d85ea4918467ab82ad5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 25 Jul 2011 17:55:57 -0400 Subject: sysfs: use rb-tree for name lookups sysfs: use rb-tree for name lookups Use red-black tree for name lookups. 
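The kernel side keys the rb-tree on strcmp() of s_name. As a userspace analog of the same O(log n) name lookup, the POSIX tsearch()/tfind() pair (typically implemented as a balanced binary tree) can stand in; this sketch is illustrative only, not how sysfs itself is built:

    #include <search.h>
    #include <stdio.h>
    #include <string.h>

    /* Compare by name, as sysfs_find_dirent() does with strcmp(). */
    static int cmp(const void *a, const void *b)
    {
        return strcmp((const char *)a, (const char *)b);
    }

    int main(void)
    {
        void *root = NULL;   /* tree root, playing the role of name_tree */
        const char *names[] = { "sda", "sdb", "loop0", "dm-0" };

        for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
            tsearch(names[i], &root, cmp);        /* O(log n) insert */

        void **hit = tfind("loop0", &root, cmp);  /* O(log n) lookup */
        printf("found: %s\n", hit ? *(const char **)hit : "(none)");
        return 0;
    }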
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- fs/sysfs/sysfs.h | 5 +++++ 2 files changed, 55 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 7d240e6b7176..3e937da224d4 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -45,6 +45,9 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) struct sysfs_dirent *parent_sd = sd->s_parent; struct sysfs_dirent **pos; + struct rb_node **p; + struct rb_node *parent; + BUG_ON(sd->s_sibling); if (sysfs_type(sd) == SYSFS_DIR) @@ -60,6 +63,23 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) } sd->s_sibling = *pos; *pos = sd; + + p = &parent_sd->s_dir.name_tree.rb_node; + parent = NULL; + while (*p) { + int c; + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, name_node) + c = strcmp(sd->s_name, node->s_name); + if (c < 0) { + p = &node->name_node.rb_left; + } else { + p = &node->name_node.rb_right; + } +#undef node + } + rb_link_node(&sd->name_node, parent, p); + rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); } /** @@ -87,6 +107,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) break; } } + + rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } /** @@ -546,15 +568,36 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns, const unsigned char *name) { - struct sysfs_dirent *sd; + struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; + struct sysfs_dirent *found = NULL; + + while (p) { + int c; +#define node rb_entry(p, struct sysfs_dirent, name_node) + c = strcmp(name, node->s_name); + if (c < 0) { + p = node->name_node.rb_left; + } else if (c > 0) { + p = node->name_node.rb_right; + } else { + found = node; + p = node->name_node.rb_left; + } +#undef node + } - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) - continue; - if (!strcmp(sd->s_name, name)) - return sd; + if (found && ns) { + while (found->s_ns && found->s_ns != ns) { + p = rb_next(&found->name_node); + if (!p) + return NULL; + found = rb_entry(p, struct sysfs_dirent, name_node); + if (strcmp(name, found->s_name)) + return NULL; + } } - return NULL; + + return found; } /** diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 6348e2c753f6..fe1a9e8650bf 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ #include #include #include +#include struct sysfs_open_dirent; @@ -21,6 +22,8 @@ struct sysfs_elem_dir { struct sysfs_dirent *children; unsigned long subdirs; + + struct rb_root name_tree; }; struct sysfs_elem_symlink { @@ -61,6 +64,8 @@ struct sysfs_dirent { struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node name_node; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; -- cgit v1.2.3-58-ga151 From 58f2a4c7932d8bec866d0394f806004146cde827 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 21 Jul 2011 20:01:12 -0400 Subject: sysfs: remove s_sibling hacks sysfs: remove s_sibling hacks s_sibling was used for three different purposes: 1) as a linked list of entries in the directory 2) as a linked list of entries to be deleted 3) as a pointer to "struct completion" This patch removes the hack and introduces a new union u which holds pointers for cases 2) and 3). This change is needed for the following patch that removes s_sibling altogether and replaces it with an rb tree.
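For readers skimming the diff, the pattern introduced here is an ordinary C union discriminated by lifecycle phase, replacing casts through an unrelated pointer field. A tiny standalone sketch of the same idea (toy types, not the sysfs structures):

    #include <stdio.h>

    struct completion_toy { int done; };

    /* One storage slot reused for two mutually exclusive purposes, made
     * explicit with a union instead of casting through a sibling pointer. */
    struct entry {
        unsigned int flags;             /* phase discriminates the union */
        union {
            struct completion_toy *completion;  /* while deactivating */
            struct entry *removed_list;         /* while queued for removal */
        } u;
    };

    #define FLAG_REMOVED 0x1

    int main(void)
    {
        struct entry a = { 0 }, b = { 0 };
        struct completion_toy wait = { 0 };

        /* Phase 1: the entry is chained on a removal list via u.removed_list. */
        a.flags |= FLAG_REMOVED;
        a.u.removed_list = &b;

        /* Phase 2: once off the list, the same slot holds the completion
         * pointer that the final put will signal. */
        a.u.completion = &wait;
        a.u.completion->done = 1;

        printf("completed: %d\n", wait.done);
        return 0;
    }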
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 19 +++++++------------ fs/sysfs/sysfs.h | 5 +++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 3e937da224d4..a4846979d4e8 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -154,7 +154,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) */ void sysfs_put_active(struct sysfs_dirent *sd) { - struct completion *cmpl; int v; if (unlikely(!sd)) @@ -166,10 +165,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) return; /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->s_sibling. + * sd->u.completion. */ - cmpl = (void *)sd->s_sibling; - complete(cmpl); + complete(sd->u.completion); } /** @@ -183,16 +181,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) return; - sd->s_sibling = (void *)&wait; + sd->u.completion = (void *)&wait; rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->s_sibling. + * the updated sd->u.completion. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); @@ -201,8 +199,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) wait_for_completion(&wait); } - sd->s_sibling = NULL; - lock_acquired(&sd->dep_map, _RET_IP_); rwsem_release(&sd->dep_map, 1, _RET_IP_); } @@ -518,7 +514,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->s_sibling = acxt->removed; + sd->u.removed_list = acxt->removed; acxt->removed = sd; } @@ -542,8 +538,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) while (acxt->removed) { struct sysfs_dirent *sd = acxt->removed; - acxt->removed = sd->s_sibling; - sd->s_sibling = NULL; + acxt->removed = sd->u.removed_list; sysfs_deactivate(sd); unmap_bin_file(sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index fe1a9e8650bf..3c261681713b 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -66,6 +66,11 @@ struct sysfs_dirent { struct rb_node name_node; + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; -- cgit v1.2.3-58-ga151 From a406f75840e15afbabd98cb64ae36b51424a8033 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 25 Jul 2011 17:57:03 -0400 Subject: sysfs: use rb-tree for inode number lookup sysfs: use rb-tree for inode number lookup This patch makes sysfs use a red-black tree for inode number lookup. Together with the previous patch that uses a red-black tree for name lookup, this patch gives all sysfs lookups O(log n) complexity.
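The inode-number tree added below is the same rbtree pattern keyed by a unique integer. A simplified sketch (illustrative names, not the sysfs code) of a duplicate-rejecting insert plus the rb_first()/rb_next() walk that stands in for the old sorted s_sibling iteration:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/rbtree.h>

struct ino_ent {
        struct rb_node node;
        unsigned long ino;
};

static int ino_tree_insert(struct rb_root *root, struct ino_ent *new)
{
        struct rb_node **p = &root->rb_node, *parent = NULL;

        while (*p) {
                struct ino_ent *ent = rb_entry(*p, struct ino_ent, node);

                parent = *p;
                if (new->ino < ent->ino)
                        p = &(*p)->rb_left;
                else if (new->ino > ent->ino)
                        p = &(*p)->rb_right;
                else
                        return -EEXIST; /* inode numbers are unique */
        }
        rb_link_node(&new->node, parent, p);
        rb_insert_color(&new->node, root);
        return 0;
}

/* In-order traversal visits entries in ascending inode order --
 * the property readdir relied on with the old sorted list. */
static void ino_tree_walk(struct rb_root *root)
{
        struct rb_node *p;

        for (p = rb_first(root); p; p = rb_next(p)) {
                struct ino_ent *ent = rb_entry(p, struct ino_ent, node);

                pr_debug("ino %lu\n", ent->ino);
        }
}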
Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 89 +++++++++++++++++++++++++++++++------------------------- fs/sysfs/sysfs.h | 5 ++-- 2 files changed, 52 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index a4846979d4e8..c3646d93a032 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,26 +43,30 @@ static DEFINE_IDA(sysfs_ino_ida); static void sysfs_link_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent **pos; struct rb_node **p; struct rb_node *parent; - BUG_ON(sd->s_sibling); - if (sysfs_type(sd) == SYSFS_DIR) parent_sd->s_dir.subdirs++; - /* Store directory entries in order by ino. This allows - * readdir to properly restart without having to add a - * cursor into the s_dir.children list. - */ - for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { - if (sd->s_ino < (*pos)->s_ino) - break; + p = &parent_sd->s_dir.inode_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, inode_node) + if (sd->s_ino < node->s_ino) { + p = &node->inode_node.rb_left; + } else if (sd->s_ino > node->s_ino) { + p = &node->inode_node.rb_right; + } else { + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", sd->s_ino); + BUG(); + } +#undef node } - sd->s_sibling = *pos; - *pos = sd; + rb_link_node(&sd->inode_node, parent, p); + rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); p = &parent_sd->s_dir.name_tree.rb_node; parent = NULL; @@ -94,20 +98,10 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { - struct sysfs_dirent **pos; - if (sysfs_type(sd) == SYSFS_DIR) sd->s_parent->s_dir.subdirs--; - for (pos = &sd->s_parent->s_dir.children; *pos; - pos = &(*pos)->s_sibling) { - if (*pos == sd) { - *pos = sd->s_sibling; - sd->s_sibling = NULL; - break; - } - } - + rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } @@ -788,21 +782,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent **pos; + struct rb_node *pos; if (!dir_sd) return; pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); sysfs_addrm_start(&acxt, dir_sd); - pos = &dir_sd->s_dir.children; - while (*pos) { - struct sysfs_dirent *sd = *pos; - + pos = rb_first(&dir_sd->s_dir.inode_tree); + while (pos) { + struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); + pos = rb_next(pos); if (sysfs_type(sd) != SYSFS_DIR) sysfs_remove_one(&acxt, sd); - else - pos = &(*pos)->s_sibling; } sysfs_addrm_finish(&acxt); @@ -925,12 +917,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, pos = NULL; } if (!pos && (ino > 1) && (ino < INT_MAX)) { - pos = parent_sd->s_dir.children; - while (pos && (ino > pos->s_ino)) - pos = pos->s_sibling; + struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; + while (p) { +#define node rb_entry(p, struct sysfs_dirent, inode_node) + if (ino < node->s_ino) { + pos = node; + p = node->inode_node.rb_left; + } else if (ino > node->s_ino) { + p = node->inode_node.rb_right; + } else { + pos = node; + break; + } +#undef node + } + } + while (pos && pos->s_ns && pos->s_ns != ns) { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); } - while 
(pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; return pos; } @@ -938,10 +946,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; + if (pos) do { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); + } while (pos && pos->s_ns && pos->s_ns != ns); return pos; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3c261681713b..ce29e28b766d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -18,11 +18,10 @@ struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ struct sysfs_elem_dir { struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; unsigned long subdirs; + struct rb_root inode_tree; struct rb_root name_tree; }; @@ -61,9 +60,9 @@ struct sysfs_dirent { struct lockdep_map dep_map; #endif struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node inode_node; struct rb_node name_node; union { -- cgit v1.2.3-58-ga151 From d27769ec3df1a8de9ca450d2dcd72d1ab259ba32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Aug 2011 20:01:04 +0200 Subject: block: add GENHD_FL_NO_PART_SCAN There are cases where suppressing partition scan is useful - e.g. for lo devices and pseudo SATA devices which advertise to be a disk but get upset on partition scan (some port multiplier control devices show such behavior). This patch adds GENHD_FL_NO_PART_SCAN which suppresses partition scan regardless of the number of possible partitions. disk_partitionable() is renamed to disk_part_scan_enabled() as suppressing partition scan doesn't imply the device can't be partitioned using BLKPG_ADD/DEL_PARTITION calls from userland. show_partition() now directly tests disk_max_parts() to maintain backward-compatibility. -v2: Updated to make it clear that only partition scan is suppressed not partitioning itself as suggested by Kay Sievers. 
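As a usage sketch (the driver context here is hypothetical, not part of this patch): a driver that knows its device carries no partition table just sets the flag before add_disk(), and the rest of the block layer is untouched:

#include <linux/genhd.h>

/* Hypothetical driver setup: register a whole disk but suppress the
 * partition-table scan. Userspace can still partition the device
 * via the BLKPG_ADD/DEL_PARTITION ioctls. */
static void mydrv_register_disk(struct gendisk *disk)
{
        disk->flags |= GENHD_FL_NO_PART_SCAN;
        add_disk(disk);
}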
Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- block/ioctl.c | 2 +- fs/block_dev.c | 2 +- include/linux/genhd.h | 6 ++++-- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/block/genhd.c b/block/genhd.c index 5cb51c55f6d8..2429ecbbd97d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -536,7 +536,7 @@ void register_disk(struct gendisk *disk) disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ - if (!disk_partitionable(disk)) + if (!disk_part_scan_enabled(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -841,7 +841,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/block/ioctl.c b/block/ioctl.c index 1124cd297263..5c74efc01903 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -101,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev) struct gendisk *disk = bdev->bd_disk; int res; - if (!disk_partitionable(disk) || bdev != bdev->bd_contains) + if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/fs/block_dev.c b/fs/block_dev.c index ff77262e887c..0bed0d4588dd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 02fa4697a0e5..6d18f3531f18 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -128,6 +128,7 @@ struct hd_struct { #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 +#define GENHD_FL_NO_PART_SCAN 512 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ @@ -234,9 +235,10 @@ static inline int disk_max_parts(struct gendisk *disk) return disk->minors; } -static inline bool disk_partitionable(struct gendisk *disk) +static inline bool disk_part_scan_enabled(struct gendisk *disk) { - return disk_max_parts(disk) > 1; + return disk_max_parts(disk) > 1 && + !(disk->flags & GENHD_FL_NO_PART_SCAN); } static inline dev_t disk_devt(struct gendisk *disk) -- cgit v1.2.3-58-ga151 From 7606f85a701ed8feeac065e133ff9a51c267aa0d Mon Sep 17 00:00:00 2001 From: srimugunthan dhandapani Date: Fri, 26 Aug 2011 16:08:39 +0530 Subject: UBIFS: fix the dark space calculation The dark space calculation should be type-cast to 64 bits when assigning to tmp64 (similar to how total_free is calculated). Otherwise the 32-bit multiplication will overflow for very large flashes. Signed-off-by: srimugunthan Signed-off-by: Artem Bityutskiy --- fs/ubifs/recovery.c | 2 +- fs/ubifs/sb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790d9328..ee4f43f4bb99 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * clean_an_unclean_leb - read and write a LEB to remove corruption. * @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938ad3d2a..6094c5a5d7a8 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); -- cgit v1.2.3-58-ga151 From 3d2544b1e4909b6dffa0d140273628913e255e45 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 11:49:30 -0400 Subject: nfsd4: clean up S_IS -> NF4 file type mapping A slightly unconventional approach to make the code more compact I could live with, but let's give the poor reader *some* chance. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f81099605256..51ec1f274501 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1772,12 +1772,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, return 0; } -static u32 nfs4_ftypes[16] = { - NF4BAD, NF4FIFO, NF4CHR, NF4BAD, - NF4DIR, NF4BAD, NF4BLK, NF4BAD, - NF4REG, NF4BAD, NF4LNK, NF4BAD, - NF4SOCK, NF4BAD, NF4LNK, NF4BAD, -}; +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} static __be32 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, @@ -1966,7 +1973,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) goto out_resource; - dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) goto out_serverfault; WRITE32(dummy); -- cgit v1.2.3-58-ga151 From 7d818a7b8fc8d26c24ee44ed1c5dece69455a7b6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 16:59:55 -0400 Subject: nfsd: open-code special directory-hardlink check We allow the fh_verify caller to specify that any object *except* those of a given type is allowed, by passing a negative type. But only one caller actually uses it. Open-code that check in the one caller. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsfh.c | 9 --------- fs/nfsd/vfs.c | 6 ++++-- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 90c6aa6d5e0f..8cd49b9bf085 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -61,7 +61,6 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) static inline __be32 nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) { - /* Type can be negative when creating hardlinks - not to a dir */ if (type > 0 && (mode & S_IFMT) != type) { if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) return nfserr_symlink; @@ -72,14 +71,6 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) else return nfserr_inval; } - if (type < 0 && (mode & S_IFMT) == -type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == -S_IFDIR) - return nfserr_isdir; - else - return nfserr_notdir; - } return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd0acca5370a..0c92a17d1770 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1632,10 +1632,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); if (err) goto out; - + err = nfserr_isdir; + if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + goto out; err = nfserr_perm; if (!len) goto out; -- cgit v1.2.3-58-ga151 From e10f9e1413576f58c18b89f6ae212a37a88b24e7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 17:04:19 -0400 Subject: nfsd: clean up nfsd_mode_check() Add some more comments, simplify logic, do & S_IFMT just once, name "type" more helpfully. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsfh.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8cd49b9bf085..c763de5c1157 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,19 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested) { - if (type > 0 && (mode & S_IFMT) != type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == S_IFDIR) - return nfserr_notdir; - else if ((mode & S_IFMT) == S_IFDIR) - return nfserr_isdir; - else - return nfserr_inval; - } - return 0; + mode &= S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) + return nfs_ok; + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; } static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, -- cgit v1.2.3-58-ga151 From e281d8100995133dc65e00b1dec8f84b91b6e8c3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 16:57:07 -0400 Subject: nfsd4: fix incorrect comment in nfsd4_set_nfs4_acl Zero means "I don't care what kind of file this is". 
And that's probably what we want--acls are also settable at least on directories, and if the filesystem doesn't want them on other objects, leave it to it to complain. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0c92a17d1770..4c22870293e6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -502,7 +502,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); if (error) return error; -- cgit v1.2.3-58-ga151 From 75c096f753b273b59f1b9a0745e9e4b5d911a312 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 15 Aug 2011 18:39:32 -0400 Subject: nfsd4: it's OK to return nfserr_symlink The nfsd4 code has a bunch of special exceptions for error returns which map nfserr_symlink to other errors. In fact, the spec makes it clear that nfserr_symlink is to be preferred over less specific errors where possible. The patch that introduced it back in 2.6.4 is "kNFSd: correct symlink related error returns.", which claims that these special exceptions represent an NFSv4 break from v2/v3 tradition--when in fact the symlink error was introduced with v4. I suspect what happened was pynfs tests were written that were overly faithful to the (known-incomplete) rfc3530 error return lists, and then code was fixed up mindlessly to make the tests pass. Delete these unnecessary exceptions. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +-------------- fs/nfsd/nfs4state.c | 6 +----- fs/nfsd/nfs4xdr.c | 4 ---- 3 files changed, 2 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d784ceb81a62..479ffb185df9 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -488,17 +488,12 @@ static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - __be32 status; - u32 *p = (u32 *)commit->co_verf.data; *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); - if (status == nfserr_symlink) - status = nfserr_inval; - return status; } static __be32 @@ -513,8 +508,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_CREATE); - if (status == nfserr_symlink) - status = nfserr_notdir; if (status) return status; @@ -740,8 +733,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); - if (status == nfserr_symlink) - return nfserr_notdir; if (!status) { fh_unlock(&cstate->current_fh); set_change_info(&remove->rm_cinfo, &cstate->current_fh); @@ -772,8 +763,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) status = nfserr_exist; - else if (status == nfserr_symlink) - status = nfserr_notdir; if (!status) { set_change_info(&rename->rn_sinfo, &cstate->current_fh); @@ -913,8 +902,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_bytes_written = cnt; - if (status ==
nfserr_symlink) - status = nfserr_inval; return status; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3787ec117400..aa0a36e3b09e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4179,12 +4179,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { - dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); - if (status == nfserr_symlink) - status = nfserr_inval; + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - } inode = cstate->current_fh.fh_dentry->d_inode; locks_init_lock(&file_lock); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51ec1f274501..78c792fb59a8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2778,8 +2778,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); - if (nfserr == nfserr_symlink) - nfserr = nfserr_inval; if (nfserr) return nfserr; eof = (read->rd_offset + maxcount >= @@ -2905,8 +2903,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 readdir->common.err == nfserr_toosmall && readdir->buffer == page) nfserr = nfserr_toosmall; - if (nfserr == nfserr_symlink) - nfserr = nfserr_notdir; if (nfserr) goto err_no_verf; -- cgit v1.2.3-58-ga151 From c10bd39d800d42adef55ed9016f802677cd0ab5f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 19 Aug 2011 11:38:52 -0400 Subject: Remove include/linux/nfsd/const.h Userspace shouldn't have a use for these constants. Nothing here is used outside fs/nfsd. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsd.h | 26 ++++++++++++++++++++++++++ include/linux/nfsd/Kbuild | 1 - include/linux/nfsd/const.h | 42 ------------------------------------------ include/linux/nfsd/nfsfh.h | 7 +++++-- 4 files changed, 31 insertions(+), 45 deletions(-) delete mode 100644 include/linux/nfsd/const.h (limited to 'fs') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7ecfa2420307..8da03e16ab35 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -11,13 +11,39 @@ #include #include +#include +#include +#include +#include +#include + #include #include #include + /* * nfsd version */ #define NFSD_SUPPORTED_MINOR_VERSION 1 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. 
+ */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild index 55d1467de3c1..0e528606d46f 100644 --- a/include/linux/nfsd/Kbuild +++ b/include/linux/nfsd/Kbuild @@ -1,4 +1,3 @@ -header-y += const.h header-y += debug.h header-y += export.h header-y += nfsfh.h diff --git a/include/linux/nfsd/const.h b/include/linux/nfsd/const.h deleted file mode 100644 index feb3764617a4..000000000000 --- a/include/linux/nfsd/const.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * include/linux/nfsd/const.h - * - * Various constants related to NFS. - * - * Copyright (C) 1995-1997 Olaf Kirch - */ - -#ifndef _LINUX_NFSD_CONST_H -#define _LINUX_NFSD_CONST_H - -#include -#include -#include -#include - -/* - * Maximum blocksizes supported by daemon under various circumstances. - */ -#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD -/* NFSv2 is limited by the protocol specification, see RFC 1094 */ -#define NFSSVC_MAXBLKSIZE_V2 (8*1024) - -#ifdef __KERNEL__ - -#include - -/* - * Largest number of bytes we need to allocate for an NFS - * call or reply. Used to control buffer sizes. We use - * the length of v3 WRITE, READDIR and READDIR replies - * which are an RPC header, up to 26 XDR units of reply - * data, and some page data. - * - * Note that accuracy here doesn't matter too much as the - * size is rounded up to a page size when allocating space. - */ -#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_NFSD_CONST_H */ diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index f76d80ccec10..ce4743a26015 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -14,11 +14,14 @@ #ifndef _LINUX_NFSD_FH_H #define _LINUX_NFSD_FH_H -# include +#include +#include +#include +#include +#include #ifdef __KERNEL__ # include #endif -#include /* * This is the old "dentry style" Linux NFSv2 file handle. -- cgit v1.2.3-58-ga151 From a043226bc140a2c1dde162246d68a67e5043e6b2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 25 Aug 2011 10:48:39 -0400 Subject: nfsd4: permit read opens of executable-only files A client that wants to execute a file must be able to read it. Read opens over nfs are therefore implicitly allowed for executable files even when those files are not readable. NFSv2/v3 get this right by using a passed-in NFSD_MAY_OWNER_OVERRIDE on read requests, but NFSv4 has gotten this wrong ever since dc730e173785e29b297aa605786c94adaffe2544 "nfsd4: fix owner-override on open", when we realized that the file owner shouldn't override permissions on non-reclaim NFSv4 opens. So we can't use NFSD_MAY_OWNER_OVERRIDE to tell nfsd_permission to allow reads of executable files. So, do the same thing we do whenever we encounter another weird NFS permission nit: define yet another NFSD_MAY_* flag. The industry's future standardization on 128-bit processors will be motivated primarily by the need for integers with enough bits for all the NFSD_MAY_* flags. Reported-by: Leonardo Borda Cc: stable@kernel.org Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 ++ fs/nfsd/vfs.c | 3 ++- fs/nfsd/vfs.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 479ffb185df9..b5530984db91 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -156,6 +156,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; + accmode |= NFSD_MAY_READ_IF_EXEC; + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4c22870293e6..75c35fa46155 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2116,7 +2116,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0bbac04d1dd..a22e40e27861 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -25,6 +25,7 @@ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 #define NFSD_MAY_NOT_BREAK_LEASE 512 #define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_READ_IF_EXEC 2048 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) -- cgit v1.2.3-58-ga151 From 8e82fa8fdcd1271d45bf6a5195801c706e190d69 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 25 Aug 2011 14:23:39 -0400 Subject: nfsd: prettify NFSD_MAY_* flag definitions Acked-by: Jim Rees Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a22e40e27861..503f3bf11abd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -10,22 +10,22 @@ /* * Flags for nfsd_permission */ -#define NFSD_MAY_NOP 0 -#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ -#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ -#define NFSD_MAY_READ 4 /* == MAY_READ */ -#define NFSD_MAY_SATTR 8 -#define NFSD_MAY_TRUNC 16 -#define NFSD_MAY_LOCK 32 -#define NFSD_MAY_MASK 63 +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ -#define NFSD_MAY_OWNER_OVERRIDE 64 -#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 -#define NFSD_MAY_NOT_BREAK_LEASE 512 -#define NFSD_MAY_BYPASS_GSS 1024 -#define NFSD_MAY_READ_IF_EXEC 2048 +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) -- cgit v1.2.3-58-ga151 From 48483bf23a568f3ef4cc7ad2c8f1a082f10ad0e7 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 26 Aug 2011 20:40:28 -0400 Subject: nfsd4: simplify recovery dir setting Move around some of this code, simplify a bit. Reviewed-by: Boaz Harrosh Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 36 ++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4state.c | 41 +---------------------------------------- fs/nfsd/state.h | 2 +- 3 files changed, 34 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 29d77f60585b..c3466610e6cd 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -45,6 +45,7 @@ /* Globals */ static struct file *rec_file; +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) @@ -354,13 +355,13 @@ nfsd4_recdir_load(void) { */ void -nfsd4_init_recdir(char *rec_dirname) +nfsd4_init_recdir() { const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", - rec_dirname); + user_recovery_dirname); BUG_ON(rec_file); @@ -372,10 +373,10 @@ nfsd4_init_recdir(char *rec_dirname) return; } - rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", - rec_dirname); + user_recovery_dirname); rec_file = NULL; } @@ -390,3 +391,30 @@ nfsd4_shutdown_recdir(void) fput(rec_file); rec_file = NULL; } + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aa0a36e3b09e..36d0beb76864 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -64,8 +64,6 @@ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_stateid * search_for_stateid(stateid_t *stid); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static void nfs4_set_recdir(char *recdir); static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); /* Locking: */ @@ -4523,7 +4521,7 @@ nfsd4_load_reboot_recovery_data(void) int status; nfs4_lock_state(); - nfsd4_init_recdir(user_recovery_dirname); + nfsd4_init_recdir(); status = nfsd4_recdir_load(); nfs4_unlock_state(); if (status) @@ -4632,40 +4630,3 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } - -/* - * user_recovery_dirname is protected by the nfsd_mutex since it's only - * accessed when nfsd is starting. - */ -static void -nfs4_set_recdir(char *recdir) -{ - strcpy(user_recovery_dirname, recdir); -} - -/* - * Change the NFSv4 recovery directory to recdir. 
- */ -int -nfs4_reset_recoverydir(char *recdir) -{ - int status; - struct path path; - - status = kern_path(recdir, LOOKUP_FOLLOW, &path); - if (status) - return status; - status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { - nfs4_set_recdir(recdir); - status = 0; - } - path_put(&path); - return status; -} - -char * -nfs4_recoverydir(void) -{ - return user_recovery_dirname; -} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 5cfebe504056..12c1b1ef52ec 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -467,7 +467,7 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void nfsd4_init_recdir(char *recdir_name); +extern void nfsd4_init_recdir(void); extern int nfsd4_recdir_load(void); extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); -- cgit v1.2.3-58-ga151 From 6577aac01f00636c16cd583c30bd4dedf18475d5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 12 Aug 2011 17:30:12 -0700 Subject: nfsd4: fix failure to end nfsd4 grace period Even if we fail to write a recovery record, we should still mark the client as having acquired its first state. Otherwise we leave 4.1 clients with indefinite ERR_GRACE returns. However, an inability to write stable storage records may cause failures of reboot recovery, and the problem should still be brought to the server administrator's attention. So, make sure the error is logged. These errors shouldn't normally be triggered on a correctly functioning server--this isn't a case where a misconfigured client could spam the logs. Signed-off-by: Boaz Harrosh Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c3466610e6cd..493851b844fe 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -130,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_file || clp->cl_firststate) return 0; + clp->cl_firststate = 1; status = nfs4_save_creds(&original_cred); if (status < 0) return status; @@ -144,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_unlock; } status = -EEXIST; - if (dentry->d_inode) { - dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + if (dentry->d_inode) goto out_put; - } status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; @@ -157,12 +156,14 @@ out_put: dput(dentry); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); - if (status == 0) { - clp->cl_firststate = 1; + if (status == 0) vfs_fsync(rec_file, 0); - } + else + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); nfs4_reset_creds(original_cred); - dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } -- cgit v1.2.3-58-ga151 From 3e77246393c0a433247631a1f0e9ec98d3d78a1c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 10 Aug 2011 19:07:33 -0400 Subject: nfsd4: stop using nfserr_resource for transitory errors The server is returning nfserr_resource for both permanent errors and for errors (like allocation failures) that might be resolved by retrying later. Save nfserr_resource for the former and use delay/jukebox for the latter. Cc: stable@kernel.org Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfs4state.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b5530984db91..50bae7471147 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -940,7 +940,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, count = 4 + (verify->ve_attrlen >> 2); buf = kmalloc(count << 2, GFP_KERNEL); if (!buf) - return nfserr_resource; + return nfserr_jukebox; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 493851b844fe..ed083b9a731b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -89,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) struct xdr_netobj cksum; struct hash_desc desc; struct scatterlist sg; - __be32 status = nfserr_resource; + __be32 status = nfserr_jukebox; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 36d0beb76864..c7d54f6a19c9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1944,7 +1944,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * of 5 bullet points, labeled as CASE0 - CASE4 below. */ unconf = find_unconfirmed_client_by_str(dname, strhashval); - status = nfserr_resource; + status = nfserr_jukebox; if (!conf) { /* * RFC 3530 14.2.33 CASE 4: @@ -2481,7 +2481,7 @@ renew: if (open->op_stateowner == NULL) { sop = alloc_init_open_stateowner(strhashval, clp, open); if (sop == NULL) - return nfserr_resource; + return nfserr_jukebox; open->op_stateowner = sop; } list_del_init(&sop->so_close_lru); @@ -2617,7 +2617,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, stp = nfs4_alloc_stateid(); if (stp == NULL) - return nfserr_resource; + return nfserr_jukebox; status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) { @@ -2848,7 +2848,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfserr_bad_stateid; if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) goto out; - status = nfserr_resource; + status = nfserr_jukebox; fp = alloc_init_file(ino); if (fp == NULL) goto out; @@ -4033,7 +4033,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? */ - status = nfserr_resource; + status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, open_sop->so_client, open_stp, lock); if (lock_sop == NULL) @@ -4117,9 +4117,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case (EDEADLK): status = nfserr_deadlock; break; - default: + default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); - status = nfserr_resource; + status = nfserrno(err); break; } out: -- cgit v1.2.3-58-ga151 From ddc04c41636f8cd374d72cdd3a0fcaa916fbc5d0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 30 Jul 2011 23:46:29 -0400 Subject: nfsd4: replace some macros by functions For all the usual reasons. (Type safety, readability.) Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c7d54f6a19c9..ce8e70cd04c5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -139,10 +139,19 @@ unsigned int max_delegations; #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) +static unsigned int ownerid_hashval(const u32 id) +{ + return id & OWNER_HASH_MASK; +} + +static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +{ + unsigned int ret; + + ret = opaque_hashval(ownername->data, ownername->len); + ret += clientid; + return ret & OWNER_HASH_MASK; +} static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; @@ -156,10 +165,16 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; #define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) #define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) +static unsigned int file_hashval(struct inode *ino) +{ + /* XXX: why are we hashing on inode pointer, anyway? */ + return hash_ptr(ino, FILE_HASH_BITS); +} + +static unsigned int stateid_hashval(u32 owner_id, u32 file_id) +{ + return (owner_id + file_id) & STATEID_HASH_MASK; +} static struct list_head file_hashtbl[FILE_HASH_SIZE]; static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; @@ -290,10 +305,16 @@ static DEFINE_SPINLOCK(client_lock); #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define clientid_hashval(id) \ - ((id) & CLIENT_HASH_MASK) -#define clientstr_hashval(name) \ - (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(const char *name) +{ + return opaque_hashval(name, 8) & CLIENT_HASH_MASK; +} + /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing @@ -2443,7 +2464,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; - strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); + strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); sop = find_openstateowner_str(strhashval, open); open->op_stateowner = sop; if (!sop) { @@ -3711,8 +3732,10 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -#define lockownerid_hashval(id) \ - ((id) & LOCK_HASH_MASK) +static unsigned int lockownerid_hashval(u32 id) +{ + return id & LOCK_HASH_MASK; +} static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, -- cgit v1.2.3-58-ga151 From 506f275fffcaceb8abe3db69e581d86c7da48a82 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 11 Aug 2011 18:50:18 -0400 Subject: nfsd4: name openowner data structures more clearly These appear to be generic (for both open and lock owners), but they're actually just for open owners. This has confused me more than once. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ce8e70cd04c5..dd6424face32 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -134,27 +134,27 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) +/* hash tables for open owners */ +#define OPEN_OWNER_HASH_BITS 8 +#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) +#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -static unsigned int ownerid_hashval(const u32 id) +static unsigned int open_ownerid_hashval(const u32 id) { - return id & OWNER_HASH_MASK; + return id & OPEN_OWNER_HASH_MASK; } -static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); ret += clientid; - return ret & OWNER_HASH_MASK; + return ret & OPEN_OWNER_HASH_MASK; } -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; +static struct list_head open_ownerid_hashtbl[OPEN_OWNER_HASH_SIZE]; +static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 @@ -2240,7 +2240,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str if (!(sop = alloc_stateowner(&open->op_owner))) return NULL; - idhashval = ownerid_hashval(current_ownerid); + idhashval = open_ownerid_hashval(current_ownerid); INIT_LIST_HEAD(&sop->so_idhash); INIT_LIST_HEAD(&sop->so_strhash); INIT_LIST_HEAD(&sop->so_perclient); @@ -2248,8 +2248,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ INIT_LIST_HEAD(&sop->so_close_lru); sop->so_time = 0; - list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&sop->so_idhash, &open_ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &open_ownerstr_hashtbl[strhashval]); list_add(&sop->so_perclient, &clp->cl_openowners); sop->so_is_open_owner = 1; sop->so_id = current_ownerid++; @@ -2313,7 +2313,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { struct nfs4_stateowner *so = NULL; - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(so, &open->op_owner, &open->op_clientid)) return so; } @@ -2464,7 +2464,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; - strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); + strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); sop = find_openstateowner_str(strhashval, open); open->op_stateowner = sop; if (!sop) { @@ -4518,9 +4518,9 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&ownerid_hashtbl[i]); + for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); + 
INIT_LIST_HEAD(&open_ownerid_hashtbl[i]); } for (i = 0; i < STATEID_HASH_SIZE; i++) { INIT_LIST_HEAD(&stateid_hashtbl[i]); -- cgit v1.2.3-58-ga151 From ff194bd95959ea9047d536b4f4ad6a992754e48d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 12 Aug 2011 09:42:57 -0400 Subject: nfsd4: cleanup lock/stateowner initialization Share some common code, stop doing silly things like initializing a list head immediately before adding it to a list, etc. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 100 +++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dd6424face32..3e64288399f7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2215,51 +2215,61 @@ nfs4_free_stateowner(struct kref *kref) kmem_cache_free(stateowner_slab, sop); } -static inline struct nfs4_stateowner * -alloc_stateowner(struct xdr_netobj *owner) +static void init_nfs4_replay(struct nfs4_replay *rp) { - struct nfs4_stateowner *sop; - - if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { - if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { - memcpy(sop->so_owner.data, owner->data, owner->len); - sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); - return sop; - } - kmem_cache_free(stateowner_slab, sop); - } - return NULL; + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; } -static struct nfs4_stateowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { +static inline struct nfs4_stateowner *alloc_stateowner(struct xdr_netobj *owner, struct nfs4_client *clp) +{ struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&open->op_owner))) + sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL); + if (!sop) + return NULL; + + sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(stateowner_slab, sop); return NULL; - idhashval = open_ownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); + } + sop->so_owner.len = owner->len; + + kref_init(&sop->so_ref); INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ + INIT_LIST_HEAD(&sop->so_perstateid); INIT_LIST_HEAD(&sop->so_close_lru); + sop->so_id = current_ownerid++; sop->so_time = 0; + sop->so_client = clp; + init_nfs4_replay(&sop->so_replay); + return sop; +} + +static void hash_openowner(struct nfs4_stateowner *sop, struct nfs4_client *clp, unsigned int strhashval) +{ + unsigned int idhashval; + + idhashval = open_ownerid_hashval(sop->so_id); list_add(&sop->so_idhash, &open_ownerid_hashtbl[idhashval]); list_add(&sop->so_strhash, &open_ownerstr_hashtbl[strhashval]); list_add(&sop->so_perclient, &clp->cl_openowners); +} + +static struct nfs4_stateowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_stateowner *sop; + + sop = alloc_stateowner(&open->op_owner, clp); + if (!sop) + return NULL; sop->so_is_open_owner = 1; - sop->so_id = current_ownerid++; - sop->so_client = clp; sop->so_seqid = open->op_seqid; sop->so_confirmed = 0; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + hash_openowner(sop, clp, strhashval); return sop; } @@ -3902,6 +3912,16 @@ 
find_lockstateowner_str(struct inode *inode, clientid_t *clid, return NULL; } +static void hash_lockowner(struct nfs4_stateowner *sop, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) +{ + unsigned int idhashval; + + idhashval = lockownerid_hashval(sop->so_id); + list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&sop->so_perstateid, &open_stp->st_lockowners); +} + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -3913,33 +3933,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, static struct nfs4_stateowner * alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + sop = alloc_stateowner(&lock->lk_new_owner, clp); + if (!sop) return NULL; - idhashval = lockownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ - sop->so_time = 0; - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); sop->so_is_open_owner = 0; - sop->so_id = current_ownerid++; - sop->so_client = clp; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ sop->so_seqid = lock->lk_new_lock_seqid + 1; sop->so_confirmed = 1; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + hash_lockowner(sop, strhashval, clp, open_stp); return sop; } -- cgit v1.2.3-58-ga151 From 28dde241cc65c9464b7627d9a9ed3a66e4df2586 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 10:07:12 -0400 Subject: nfsd4: remove HAS_SESSION This flag doesn't really buy us anything. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 30 ++++++++++-------------------- fs/nfsd/state.h | 3 +-- fs/nfsd/xdr4.h | 2 +- 3 files changed, 12 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e64288399f7..14c8dd64e136 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3168,13 +3168,13 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. 
*/ - if ((flags & HAS_SESSION) && in->si_generation == 0) + if (has_session && in->si_generation == 0) goto out; /* If the client sends us a stateid from the future, it's buggy: */ @@ -3206,7 +3206,7 @@ static int is_open_stateid(struct nfs4_stateid *stateid) return stateid->st_openstp == NULL; } -__be32 nfs4_validate_stateid(stateid_t *stateid, int flags) +__be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { struct nfs4_stateid *stp = NULL; __be32 status = nfserr_stale_stateid; @@ -3223,7 +3223,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, int flags) if (!stp->st_stateowner->so_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = check_stateid_generation(stateid, &stp->st_stateid, has_session); if (status) goto out; @@ -3251,9 +3251,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (grace_disallows_io(ino)) return nfserr_grace; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); @@ -3270,8 +3267,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(ino, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, - flags); + status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); if (status) goto out; status = nfs4_check_delegmode(dp, flags); @@ -3292,7 +3288,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (!stp->st_stateowner->so_confirmed) goto out; status = check_stateid_generation(stateid, &stp->st_stateid, - flags); + nfsd4_has_session(cstate)); if (status) goto out; status = nfs4_check_openmode(stp, flags); @@ -3421,9 +3417,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (STALE_STATEID(stateid)) return nfserr_stale_stateid; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - /* * We return BAD_STATEID if filehandle doesn't match stateid, * the confirmed flag is incorrecly set, or the generation @@ -3457,7 +3450,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (lock->lk_is_new) { if (!sop->so_is_open_owner) return nfserr_bad_stateid; - if (!(flags & HAS_SESSION) && + if (!nfsd4_has_session(cstate) && !same_clid(&clp->cl_clientid, lockclid)) return nfserr_bad_stateid; /* stp is the open stateid */ @@ -3482,7 +3475,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * For the moment, we ignore the possibility of * generation number wraparound. 
*/ - if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) + if (!nfsd4_has_session(cstate) && seqid != sop->so_seqid) goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { @@ -3495,7 +3488,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); if (status) return status; renew_client(sop->so_client); @@ -3679,14 +3672,11 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &dr->dr_stateid; struct inode *inode; __be32 status; - int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; inode = cstate->current_fh.fh_dentry->d_inode; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; nfs4_lock_state(); status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -3701,7 +3691,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); if (status) goto out; renew_client(dp->dl_client); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 12c1b1ef52ec..f02badd70cf2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -439,7 +439,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define HAS_SESSION 0x00000001 #define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 @@ -476,7 +475,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(stateid_t *, int); +extern __be32 nfs4_validate_stateid(stateid_t *, bool); static inline void nfs4_put_stateowner(struct nfs4_stateowner *so) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d2a8d04428c7..663193b21a24 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -351,7 +351,7 @@ struct nfsd4_saved_compoundargs { struct nfsd4_test_stateid { __be32 ts_num_ids; - __be32 ts_has_session; + bool ts_has_session; struct nfsd4_compoundargs *ts_saved_args; struct nfsd4_saved_compoundargs ts_savedp; }; -- cgit v1.2.3-58-ga151 From a9004abc34239705840eaf6fe3d6cc9cb7656cba Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 Aug 2011 15:43:04 -0400 Subject: nfsd4: cleanup and consolidate seqid_mutating_err Signed-off-by: J. Bruce Fields --- fs/nfs/nfs4_fs.h | 24 ------------------------ fs/nfsd/nfs4xdr.c | 14 +------------- include/linux/nfs4.h | 16 ++++++++++++++++ 3 files changed, 17 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1ec1a85fa71c..1a652a0bd7db 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,30 +13,6 @@ struct idmap; -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations. This is true even in the event that the previous - * operation that used the sequence number received an error. 
The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. - * - */ -#define seqid_mutating_err(err) \ -(((err) != NFSERR_STALE_CLIENTID) && \ - ((err) != NFSERR_STALE_STATEID) && \ - ((err) != NFSERR_BAD_STATEID) && \ - ((err) != NFSERR_BAD_SEQID) && \ - ((err) != NFSERR_BAD_XDR) && \ - ((err) != NFSERR_RESOURCE) && \ - ((err) != NFSERR_NOFILEHANDLE)) - enum nfs4_client_state { NFS4CLNT_MANAGER_RUNNING = 0, NFS4CLNT_CHECK_LEASE, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 78c792fb59a8..04ad9a2ca3d0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1623,18 +1623,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) \ save = resp->p; -static bool seqid_mutating_err(__be32 err) -{ - /* rfc 3530 section 8.1.5: */ - return err != nfserr_stale_clientid && - err != nfserr_stale_stateid && - err != nfserr_bad_stateid && - err != nfserr_bad_seqid && - err != nfserr_bad_xdr && - err != nfserr_resource && - err != nfserr_nofilehandle; -} - /* * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This * is where sequence id's are incremented, and the replay cache is filled. @@ -1643,7 +1631,7 @@ static bool seqid_mutating_err(__be32 err) */ #define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(nfserr) && stateowner) { \ + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { \ stateowner->so_seqid++; \ stateowner->so_replay.rp_status = nfserr; \ stateowner->so_replay.rp_buflen = \ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 76f99e8714f3..b875b0324fc0 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -373,6 +373,22 @@ enum nfsstat4 { NFS4ERR_DELEG_REVOKED = 10087, /* deleg./layout revoked */ }; +static inline bool seqid_mutating_err(u32 err) +{ + /* rfc 3530 section 8.1.5: */ + switch (err) { + case NFS4ERR_STALE_CLIENTID: + case NFS4ERR_STALE_STATEID: + case NFS4ERR_BAD_STATEID: + case NFS4ERR_BAD_SEQID: + case NFS4ERR_BADXDR: + case NFS4ERR_RESOURCE: + case NFS4ERR_NOFILEHANDLE: + return false; + }; + return true; +} + /* * Note: NF4BAD is not actually part of the protocol; it is just used * internally by nfsd. -- cgit v1.2.3-58-ga151 From 9afb978400e65ea6a3200b3e163606ce86e13c25 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 11:39:07 -0400 Subject: nfsd4: simplify lock openmode check Note that the special handling for the lock stateid case is already done by nfs4_check_openmode() (as of 02921914170e3b7fea1cd82dac9713685d2de5e2 "nfsd4: fix openmode checking on IO using lock stateid") so we no longer need these two cases in the caller. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 14c8dd64e136..aafc41a0a8d0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3453,16 +3453,11 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (!nfsd4_has_session(cstate) && !same_clid(&clp->cl_clientid, lockclid)) return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; - } + } + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; } if (nfs4_check_fh(current_fh, stp)) { -- cgit v1.2.3-58-ga151 From b34f27aa5da75b0b6c054e76bb4b92aea7aac04b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 13:13:31 -0400 Subject: nfsd4: get lock checks out of preprocess_seqid_op We've got some lock-specific code here in nfs4_preprocess_seqid_op which is only used by nfsd4_lock(). Move it to the caller. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aafc41a0a8d0..d2bf80d8d85b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3396,7 +3396,7 @@ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, - struct nfs4_stateid **stpp, struct nfsd4_lock *lock) + struct nfs4_stateid **stpp) { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; @@ -3439,27 +3439,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, *stpp = stp; *sopp = sop = stp->st_stateowner; - if (lock) { - clientid_t *lockclid = &lock->v.new.clientid; - struct nfs4_client *clp = sop->so_client; - int lkflg = 0; - __be32 status; - - lkflg = setlkflg(lock->lk_type); - - if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!nfsd4_has_session(cstate) && - !same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; - } - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } - if (nfs4_check_fh(current_fh, stp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; @@ -3522,7 +3501,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp, NULL))) + &oc->oc_stateowner, &stp))) goto out; sop = oc->oc_stateowner; @@ -3585,7 +3564,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, od->od_seqid, &od->od_stateid, OPEN_STATE, - &od->od_stateowner, &stp, NULL))) + &od->od_stateowner, &stp))) goto out; status = nfserr_inval; @@ -3635,7 +3614,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, close->cl_seqid, &close->cl_stateid, OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp, NULL))) + &close->cl_stateowner, &stp))) goto out; status = nfs_ok; update_stateid(&stp->st_stateid); @@ -3997,6 +3976,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock conflock; __be32 status = 0; 
unsigned int strhashval; + int lkflg; int err; dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", @@ -4032,11 +4012,17 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, OPEN_STATE, - &lock->lk_replay_owner, &open_stp, - lock); + &lock->lk_replay_owner, &open_stp); if (status) goto out; + status = nfserr_bad_stateid; open_sop = lock->lk_replay_owner; + if (!open_sop->so_is_open_owner) + goto out; + if (!nfsd4_has_session(cstate) && + !same_clid(&open_sop->so_client->cl_clientid, + &lock->v.new.clientid)) + goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; strhashval = lock_ownerstr_hashval(fp->fi_inode, @@ -4059,7 +4045,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, LOCK_STATE, - &lock->lk_replay_owner, &lock_stp, lock); + &lock->lk_replay_owner, &lock_stp); if (status) goto out; lock_sop = lock->lk_replay_owner; @@ -4067,6 +4053,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } /* lock->lk_replay_owner and lock_stp have been created or found */ + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; + status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) goto out; @@ -4259,7 +4250,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_seqid, &locku->lu_stateid, LOCK_STATE, - &locku->lu_stateowner, &stp, NULL))) + &locku->lu_stateowner, &stp))) goto out; filp = find_any_file(stp->st_file); -- cgit v1.2.3-58-ga151 From 3cc9fda40a427aeb176bab898edca4e9a3ada524 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 Aug 2011 15:17:50 -0400 Subject: nfsd4: remove redundant is_open_owner check When called with OPEN_STATE, preprocess_seqid_op only returns an open stateid, hence only an open owner. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d2bf80d8d85b..d2b637b717c3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4017,8 +4017,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfserr_bad_stateid; open_sop = lock->lk_replay_owner; - if (!open_sop->so_is_open_owner) - goto out; if (!nfsd4_has_session(cstate) && !same_clid(&open_sop->so_client->cl_clientid, &lock->v.new.clientid)) -- cgit v1.2.3-58-ga151 From f32948ddd1179ac0b105ceacc235cfc3f98ebea3 Mon Sep 17 00:00:00 2001 From: Li Haifeng Date: Tue, 30 Aug 2011 17:32:50 +0200 Subject: ext2: fix the outdated comment in ext2_nfs_get_inode() Signed-off-by: Li Haifeng Signed-off-by: Jan Kara --- fs/ext2/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1dd62ed35b85..bd8ac164a3bf 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb, if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) return ERR_PTR(-ESTALE); - /* iget isn't really right if the inode is currently unallocated!! - * ext2_read_inode currently does appropriate checks, but - * it might be "neater" to call ext2_get_inode first and check - * if the inode is valid..... + /* + * ext2_iget isn't quite right if the inode is currently unallocated! 
+ * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. */ inode = ext2_iget(sb, ino); if (IS_ERR(inode)) -- cgit v1.2.3-58-ga151 From c152292f9ee7eb4ed30edc0bd5027a5beef5f5e8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 26 Aug 2011 17:22:06 -0400 Subject: nfsd: remove include/linux/nfsd/syscall.h We don't need this any more. Signed-off-by: J. Bruce Fields --- fs/compat.c | 1 - fs/nfsd/export.c | 1 - fs/nfsd/nfsctl.c | 1 - include/linux/nfsd/Kbuild | 1 - include/linux/nfsd/syscall.h | 116 ------------------------------------------- 5 files changed, 120 deletions(-) delete mode 100644 include/linux/nfsd/syscall.h (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 0b48d018e38a..f2b36d472c73 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f4cc1e2bfc54..d491421cd708 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -16,7 +16,6 @@ #include #include -#include #include #include "nfsd.h" diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c7716143cbd1..db34a585e112 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild index 0e528606d46f..b8d4001212b3 100644 --- a/include/linux/nfsd/Kbuild +++ b/include/linux/nfsd/Kbuild @@ -2,4 +2,3 @@ header-y += debug.h header-y += export.h header-y += nfsfh.h header-y += stats.h -header-y += syscall.h diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h deleted file mode 100644 index 812bc1e160dc..000000000000 --- a/include/linux/nfsd/syscall.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * include/linux/nfsd/syscall.h - * - * This file holds all declarations for the knfsd syscall interface. - * - * Copyright (C) 1995-1997 Olaf Kirch - */ - -#ifndef NFSD_SYSCALL_H -#define NFSD_SYSCALL_H - -#include -#include - -/* - * Version of the syscall interface - */ -#define NFSCTL_VERSION 0x0201 - -/* - * These are the commands understood by nfsctl(). - */ -#define NFSCTL_SVC 0 /* This is a server process. */ -#define NFSCTL_ADDCLIENT 1 /* Add an NFS client. */ -#define NFSCTL_DELCLIENT 2 /* Remove an NFS client. */ -#define NFSCTL_EXPORT 3 /* export a file system. */ -#define NFSCTL_UNEXPORT 4 /* unexport a file system. */ -/*#define NFSCTL_UGIDUPDATE 5 / * update a client's uid/gid map. 
DISCARDED */ -/*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ -#define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ -#define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ - -/* SVC */ -struct nfsctl_svc { - unsigned short svc_port; - int svc_nthreads; -}; - -/* ADDCLIENT/DELCLIENT */ -struct nfsctl_client { - char cl_ident[NFSCLNT_IDMAX+1]; - int cl_naddr; - struct in_addr cl_addrlist[NFSCLNT_ADDRMAX]; - int cl_fhkeytype; - int cl_fhkeylen; - unsigned char cl_fhkey[NFSCLNT_KEYMAX]; -}; - -/* EXPORT/UNEXPORT */ -struct nfsctl_export { - char ex_client[NFSCLNT_IDMAX+1]; - char ex_path[NFS_MAXPATHLEN+1]; - __kernel_old_dev_t ex_dev; - __kernel_ino_t ex_ino; - int ex_flags; - __kernel_uid_t ex_anon_uid; - __kernel_gid_t ex_anon_gid; -}; - -/* GETFD */ -struct nfsctl_fdparm { - struct sockaddr gd_addr; - char gd_path[NFS_MAXPATHLEN+1]; - int gd_version; -}; - -/* GETFS - GET Filehandle with Size */ -struct nfsctl_fsparm { - struct sockaddr gd_addr; - char gd_path[NFS_MAXPATHLEN+1]; - int gd_maxlen; -}; - -/* - * This is the argument union. - */ -struct nfsctl_arg { - int ca_version; /* safeguard */ - union { - struct nfsctl_svc u_svc; - struct nfsctl_client u_client; - struct nfsctl_export u_export; - struct nfsctl_fdparm u_getfd; - struct nfsctl_fsparm u_getfs; - /* - * The following dummy member is needed to preserve binary compatibility - * on platforms where alignof(void*)>alignof(int). It's needed because - * this union used to contain a member (u_umap) which contained a - * pointer. - */ - void *u_ptr; - } u; -#define ca_svc u.u_svc -#define ca_client u.u_client -#define ca_export u.u_export -#define ca_getfd u.u_getfd -#define ca_getfs u.u_getfs -}; - -union nfsctl_res { - __u8 cr_getfh[NFS_FHSIZE]; - struct knfsd_fh cr_getfs; -}; - -#ifdef __KERNEL__ -/* - * Kernel syscall implementation. - */ -extern int exp_addclient(struct nfsctl_client *ncp); -extern int exp_delclient(struct nfsctl_client *ncp); -extern int exp_export(struct nfsctl_export *nxp); -extern int exp_unexport(struct nfsctl_export *nxp); - -#endif /* __KERNEL__ */ - -#endif /* NFSD_SYSCALL_H */ -- cgit v1.2.3-58-ga151 From 1cd9f0976aa4606db8d6e3dc3edd0aca8019372a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 11:54:51 -0400 Subject: ext2,ext3,ext4: don't inherit APPEND_FL or IMMUTABLE_FL for new inodes This doesn't make much sense, and it exposes a bug in the kernel where attempts to create a new file in an append-only directory using O_CREAT will fail (but still leave a zero-length file). This was discovered when xfstests #79 was generalized so it could run on all file systems. Signed-off-by: "Theodore Ts'o" Cc:stable@kernel.org --- fs/ext4/ext4.h | 3 +-- include/linux/ext2_fs.h | 4 ++-- include/linux/ext3_fs.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d7bd0f066e..5c38120c389c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -358,8 +358,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. 
*/ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index 53792bf36c71..ce1b719e8bd4 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -197,8 +197,8 @@ struct ext2_group_desc /* Flags that should be inherited by new inodes from their parent. */ #define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ - EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\ - EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\ + EXT2_SYNC_FL | EXT2_NODUMP_FL |\ + EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\ EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 67a803aee619..0244611eb2b8 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -180,8 +180,8 @@ struct ext3_group_desc /* Flags that should be inherited by new inodes from their parent. */ #define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ - EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\ - EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) -- cgit v1.2.3-58-ga151 From 84ebd795613488992b273220c2937d575d27d2a9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 11:56:51 -0400 Subject: ext4: fake direct I/O mode for data=journal Currently attempts to open a file with O_DIRECT in data=journal mode causes the open to fail with -EINVAL. This makes it very hard to test data=journal mode. So we will let the open succeed, but then always fall back to O_DSYNC buffered writes. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18d2558b7624..b84f127c085d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2854,6 +2854,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; + /* + * If we are doing data journalling we don't support O_DIRECT + */ + if (ext4_should_journal_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -2923,6 +2929,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; -- cgit v1.2.3-58-ga151 From bcaa992975041e40449be8c010c26192b8c8b409 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Aug 2011 11:58:51 -0400 Subject: ext4: ext4_rename should dirty dir_bh with the correct directory When ext4_rename performs a directory rename (move), dir_bh is a buffer that is modified to update the '..' link in the directory being moved (old_inode). However, ext4_handle_dirty_metadata is called with the old parent directory inode (old_dir) and dir_bh, which is incorrect because dir_bh does not belong to the parent inode. 
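Illustration (not part of the patch text; the identifiers are the ext4_rename locals described above): the rule behind this fix, and behind the two ext4 fixes that follow, is that ext4_handle_dirty_metadata() must be passed the inode that the dirtied buffer actually belongs to.

	/*
	 * dir_bh caches a block of old_inode, the directory being moved
	 * (its ".." entry is what gets rewritten), so the buffer must be
	 * journalled against old_inode rather than the old parent old_dir:
	 */
	retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
	if (retval)
		ext4_std_error(old_dir->i_sb, retval);
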
Fix this error. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f8068c7bae9f..09f930b7a785 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2529,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; -- cgit v1.2.3-58-ga151 From f9287c1f2d329f4d78a3bbc9cf0db0ebae6f146a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Aug 2011 12:00:51 -0400 Subject: ext4: ext4_mkdir should dirty dir_block with newly created directory inode ext4_mkdir calls ext4_handle_dirty_metadata with dir_block and the inode "dir". Unfortunately, dir_block belongs to the newly created directory (which is "inode"), not the parent directory (which is "dir"). Fix the incorrect association. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 09f930b7a785..f0abe4323136 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1862,7 +1862,7 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); -- cgit v1.2.3-58-ga151 From 5930ea643805feb50a2f8383ae12eb6f10935e49 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 12:02:51 -0400 Subject: ext4: call ext4_handle_dirty_metadata with correct inode in ext4_dx_add_entry ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads that point to directory blocks assigned to the directory inode. However, the function calls ext4_handle_dirty_metadata with the inode of the file that's being added to the directory, not the directory inode itself. Therefore, correct the code to dirty the directory buffers with the directory inode, not the file inode. Signed-off-by: Darrick J. 
Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f0abe4323136..a067835bbac1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1585,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1611,7 +1611,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; -- cgit v1.2.3-58-ga151 From b7d7ca35807b4c8ca3271885b47e67c843376f77 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 31 Aug 2011 15:39:30 -0400 Subject: nfsd4: fix off-by-one-error in SEQUENCE reply The values here represent highest slotid numbers. Since slotid's are numbered starting from zero, the highest should be one less than the number of slots. Reported-by: Rick Macklem Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 04ad9a2ca3d0..f982d8574d95 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3221,9 +3221,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); WRITE32(seq->seqid); WRITE32(seq->slotid); - WRITE32(seq->maxslots); - /* For now: target_maxslots = maxslots */ - WRITE32(seq->maxslots); + /* Note slotid's are numbered from zero: */ + WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ + WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ WRITE32(seq->status_flags); ADJUST_ARGS(); -- cgit v1.2.3-58-ga151 From c2d8eb7ac645e1baba7205cb2631e2f21db3d6a9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 29 Aug 2011 10:36:11 -0400 Subject: nfsd4: remove typoed replay field Wow, I wonder how long that typo's been there. Signed-off-by: J. Bruce Fields --- fs/nfsd/state.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index f02badd70cf2..6b706a60ce88 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -312,7 +312,6 @@ struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; char *rp_buf; - unsigned intrp_allocated; struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -- cgit v1.2.3-58-ga151 From 5fa0bbb4ee5481a6b3e83c4968142ca433d71914 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 31 Aug 2011 15:25:46 -0400 Subject: nfsd4: simplify distinguishing lock & open stateid's The trick free_stateid is using is a little cheesy, and we'll have more uses for this field later. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 9 +++------ fs/nfsd/state.h | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d2b637b717c3..7de214b860db 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2285,6 +2285,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &sop->so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); + stp->st_type = NFS4_OPEN_STID; stp->st_stateowner = sop; get_nfs4_file(fp); stp->st_file = fp; @@ -3201,11 +3202,6 @@ static int is_delegation_stateid(stateid_t *stateid) return stateid->si_fileid == 0; } -static int is_open_stateid(struct nfs4_stateid *stateid) -{ - return stateid->st_openstp == NULL; -} - __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { struct nfs4_stateid *stp = NULL; @@ -3369,7 +3365,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } } - if (is_open_stateid(stp)) { + if (stp->st_type == NFS4_OPEN_STID) { ret = nfserr_locks_held; goto out; } else { @@ -3928,6 +3924,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &sop->so_stateids); stp->st_stateowner = sop; + stp->st_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 6b706a60ce88..a06f55bd38b6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -425,6 +425,9 @@ static inline struct file *find_any_file(struct nfs4_file *f) */ struct nfs4_stateid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 + char st_type; struct list_head st_hash; struct list_head st_perfile; struct list_head st_perstateowner; -- cgit v1.2.3-58-ga151 From b79abaddfe7ef29e00d71721cf03a33e00d53317 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 22 Aug 2011 18:01:43 -0400 Subject: nfsd4: consolidate lock & open stateid tables There's no reason to have two separate hash tables for open and lock stateid's. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 63 +++++++++++++---------------------------------------- 1 file changed, 15 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7de214b860db..0198328c0e84 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static u64 current_sessionid = 1; /* forward declarations */ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); -static struct nfs4_stateid * search_for_stateid(stateid_t *stid); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); @@ -3211,7 +3210,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_expired; - stp = search_for_stateid(stateid); + stp = find_stateid(stateid, 0); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3349,7 +3348,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - stp = search_for_stateid(stateid); + stp = find_stateid(stateid, 0); if (!stp) { ret = nfserr_bad_stateid; goto out; @@ -3718,7 +3717,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; static int same_stateid(stateid_t *id_one, stateid_t *id_two) @@ -3728,50 +3726,21 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_stateid * -find_stateid(stateid_t *stid, int flags) +static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) { - struct nfs4_stateid *local; - u32 st_id = stid->si_stateownerid; - u32 f_id = stid->si_fileid; + struct nfs4_stateid *s; unsigned int hashval; - dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - - if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; + hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); + list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { + if (!same_stateid(&s->st_stateid, t)) + continue; + if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) + return NULL; + if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) + return NULL; + return s; } - } - return NULL; -} - -static struct nfs4_stateid * -search_for_stateid(stateid_t *stid) -{ - struct nfs4_stateid *local; - unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid); - - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } - - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } return NULL; } @@ -3920,7 +3889,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc 
INIT_LIST_HEAD(&stp->st_perfile); INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ - list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &sop->so_stateids); stp->st_stateowner = sop; @@ -4497,10 +4466,8 @@ nfs4_state_init(void) INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); INIT_LIST_HEAD(&open_ownerid_hashtbl[i]); } - for (i = 0; i < STATEID_HASH_SIZE; i++) { + for (i = 0; i < STATEID_HASH_SIZE; i++) INIT_LIST_HEAD(&stateid_hashtbl[i]); - INIT_LIST_HEAD(&lockstateid_hashtbl[i]); - } for (i = 0; i < LOCK_HASH_SIZE; i++) { INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); -- cgit v1.2.3-58-ga151 From 81b829655d418316f0707b3656b45cff7a1dbf12 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 Aug 2011 11:03:29 -0400 Subject: nfsd4: simplify stateid generation code, fix wraparound Follow the recommendation from rfc3530bis for stateid generation number wraparound, simplify some code, and fix or remove incorrect comments. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 52 +++++++++++++++++++++++----------------------------- fs/nfsd/state.h | 3 +++ 2 files changed, 26 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0198328c0e84..106e8fa63cdf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3168,6 +3168,12 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } +/* Returns true iff a is later than b: */ +static bool stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)a->si_generation - (s32)b->si_generation > 0; +} + static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -3175,25 +3181,25 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess * when it is zero. */ if (has_session && in->si_generation == 0) - goto out; + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (in->si_generation > ref->si_generation) + if (stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* - * The following, however, can happen. For example, if the - * client sends an open and some IO at the same time, the open - * may bump si_generation while the IO is still in flight. - * Thanks to hard links and renames, the client never knows what - * file an open will affect. So it could avoid that situation - * only by serializing all opens and IO from the same open - * owner. To recover from the old_stateid error, the client - * will just have to retry the IO: + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. 
The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: */ - if (in->si_generation < ref->si_generation) - return nfserr_old_stateid; -out: - return nfs_ok; + return nfserr_old_stateid; } static int is_delegation_stateid(stateid_t *stateid) @@ -3353,16 +3359,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_bad_stateid; goto out; } - if (stateid->si_generation != 0) { - if (stateid->si_generation < stp->st_stateid.si_generation) { - ret = nfserr_old_stateid; - goto out; - } - if (stateid->si_generation > stp->st_stateid.si_generation) { - ret = nfserr_bad_stateid; - goto out; - } - } + ret = check_stateid_generation(stateid, &stp->st_stateid, 1); + if (ret) + goto out; if (stp->st_type == NFS4_OPEN_STID) { ret = nfserr_locks_held; @@ -3439,11 +3438,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfserr_bad_stateid; } - /* - * We now validate the seqid and stateid generation numbers. - * For the moment, we ignore the possibility of - * generation number wraparound. - */ if (!nfsd4_has_session(cstate) && seqid != sop->so_seqid) goto check_replay; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a06f55bd38b6..c425717715f6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -293,6 +293,9 @@ static inline void update_stateid(stateid_t *stateid) { stateid->si_generation++; + /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ + if (stateid->si_generation == 0) + stateid->si_generation = 1; } /* A reasonable value for REPLAY_ISIZE was estimated as follows: -- cgit v1.2.3-58-ga151 From 73997dc4183c580278ea8cb41c7a9655940801e0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 31 Aug 2011 15:47:21 -0400 Subject: nfsd4: make delegation stateid's seqid start at 1 Thanks to Casey for reminding me that 5661 gives a special meaning to a value of 0 in the stateid's seqid field, so all stateid's should start out with si_generation 1. We were doing that in the open and lock cases for minorversion 1, but not for the delegation stateid, and not for openstateid's with v4.0. It doesn't *really* matter much for v4.0 or for delegation stateid's (which never get the seqid field incremented), but we may as well do the same for all of them. Reported-by: Casey Bodley Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 106e8fa63cdf..80af79ee5d51 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -250,7 +250,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_stateid.si_boot = boot_time; dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 0; + dp->dl_stateid.si_generation = 1; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -2291,6 +2291,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; + /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; @@ -2894,7 +2895,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; - update_stateid(&stp->st_stateid); } else { status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) @@ -2905,9 +2905,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } - if (nfsd4_has_session(&resp->cstate)) - update_stateid(&stp->st_stateid); } + update_stateid(&stp->st_stateid); memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) @@ -3893,6 +3892,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; + /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; -- cgit v1.2.3-58-ga151 From f3e4223751392b9bc0195a806a6e99b4cc399ac0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 24 Aug 2011 12:27:31 -0400 Subject: nfsd4: centralize handling of replay owners Set the stateowner associated with a replay in one spot in nfs4_preprocess_seqid_op() and keep it in cstate. This allows removing a few lines of boilerplate from all the nfs4_preprocess_seqid_op() callers. Also turn ENCODE_SEQID_OP_TAIL into a function while we're here. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 25 ++++--------------------- fs/nfsd/nfs4xdr.c | 34 +++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 80af79ee5d51..e4535ff92876 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3425,12 +3425,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, /* It's not stale; let's assume it's expired: */ if (sop == NULL) return nfserr_expired; - *sopp = sop; + nfs4_get_stateowner(sop); + cstate->replay_owner = sop; goto check_replay; } *stpp = stp; *sopp = sop = stp->st_stateowner; + nfs4_get_stateowner(sop); + cstate->replay_owner = sop; if (nfs4_check_fh(current_fh, stp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); @@ -3501,10 +3504,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_create_clid_dir(sop->so_client); out: - if (oc->oc_stateowner) { - nfs4_get_stateowner(oc->oc_stateowner); - cstate->replay_owner = oc->oc_stateowner; - } nfs4_unlock_state(); return status; } @@ -3574,10 +3573,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); status = nfs_ok; out: - if (od->od_stateowner) { - nfs4_get_stateowner(od->od_stateowner); - cstate->replay_owner = od->od_stateowner; - } nfs4_unlock_state(); return status; } @@ -3618,10 +3613,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (list_empty(&close->cl_stateowner->so_stateids)) move_to_close_lru(close->cl_stateowner); out: - if (close->cl_stateowner) { - nfs4_get_stateowner(close->cl_stateowner); - cstate->replay_owner = close->cl_stateowner; - } nfs4_unlock_state(); return status; } @@ -4086,10 +4077,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - if (lock->lk_replay_owner) { - nfs4_get_stateowner(lock->lk_replay_owner); - cstate->replay_owner = lock->lk_replay_owner; - } nfs4_unlock_state(); return status; } @@ -4244,10 +4231,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); out: - if (locku->lu_stateowner) { - nfs4_get_stateowner(locku->lu_stateowner); - cstate->replay_owner = locku->lu_stateowner; - } nfs4_unlock_state(); return status; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f982d8574d95..ee1267838719 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1630,15 +1630,19 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) * we know whether the error to be returned is a sequence id mutating error. 
*/ -#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { \ - stateowner->so_seqid++; \ - stateowner->so_replay.rp_status = nfserr; \ - stateowner->so_replay.rp_buflen = \ - (((char *)(resp)->p - (char *)save)); \ - memcpy(stateowner->so_replay.rp_buf, save, \ - stateowner->so_replay.rp_buflen); \ - } } while (0); +static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) +{ + struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; + + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { + stateowner->so_seqid++; + stateowner->so_replay.rp_status = nfserr; + stateowner->so_replay.rp_buflen = + (char *)resp->p - (char *)save; + memcpy(stateowner->so_replay.rp_buf, save, + stateowner->so_replay.rp_buflen); + } +} /* Encode as an array of strings the string given with components * separated @sep. @@ -2495,7 +2499,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2599,7 +2603,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2619,7 +2623,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2700,7 +2704,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - ENCODE_SEQID_OP_TAIL(open->op_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2712,7 +2716,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2724,7 +2728,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - ENCODE_SEQID_OP_TAIL(od->od_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } -- cgit v1.2.3-58-ga151 From 9072d5c66b17292e3cd055bc8057b2ce6af2fe34 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 24 Aug 2011 12:45:03 -0400 Subject: nfsd4: cleanup seqid op stateowner usage Now that the replay owner is in the cstate we can remove it from a lot of other individual operations and further simplify nfs4_preprocess_seqid_op(). Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 57 ++++++++++++++++++++++------------------------------- fs/nfsd/nfs4xdr.c | 5 ----- fs/nfsd/xdr4.h | 7 ------- 3 files changed, 24 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e4535ff92876..bc1a9dbc289c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3388,7 +3388,6 @@ setlkflg (int type) static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, - struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp) { struct nfs4_stateid *stp; @@ -3400,7 +3399,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, seqid, STATEID_VAL(stateid)); *stpp = NULL; - *sopp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); @@ -3431,7 +3429,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, } *stpp = stp; - *sopp = sop = stp->st_stateowner; + sop = stp->st_stateowner; nfs4_get_stateowner(sop); cstate->replay_owner = sop; @@ -3467,7 +3465,6 @@ check_replay: } dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", sop->so_seqid, seqid); - *sopp = NULL; return nfserr_bad_seqid; } @@ -3489,13 +3486,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp))) + CONFIRM | OPEN_STATE, &stp); + if (status) goto out; - sop = oc->oc_stateowner; + sop = stp->st_stateowner; sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3547,11 +3544,9 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, return nfserr_inval; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - od->od_seqid, - &od->od_stateid, - OPEN_STATE, - &od->od_stateowner, &stp))) + status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, + &od->od_stateid, OPEN_STATE, &stp); + if (status) goto out; status = nfserr_inval; @@ -3586,6 +3581,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfs4_stateid *stp; + struct nfs4_stateowner *so; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3593,12 +3589,12 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(cstate, - close->cl_seqid, + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp))) + OPEN_STATE | CLOSE_STATE, &stp); + if (status) goto out; + so = stp->st_stateowner; status = nfs_ok; update_stateid(&stp->st_stateid); memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3610,8 +3606,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&close->cl_stateowner->so_stateids)) - move_to_close_lru(close->cl_stateowner); + if (list_empty(&so->so_stateids)) + move_to_close_lru(so); out: nfs4_unlock_state(); return status; @@ -3962,12 +3958,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, 
lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, - &lock->lk_replay_owner, &open_stp); + OPEN_STATE, &open_stp); if (status) goto out; status = nfserr_bad_stateid; - open_sop = lock->lk_replay_owner; + open_sop = open_stp->st_stateowner; if (!nfsd4_has_session(cstate) && !same_clid(&open_sop->so_client->cl_clientid, &lock->v.new.clientid)) @@ -3993,14 +3988,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, - LOCK_STATE, - &lock->lk_replay_owner, &lock_stp); + LOCK_STATE, &lock_stp); if (status) goto out; - lock_sop = lock->lk_replay_owner; + lock_sop = lock_stp->st_stateowner; fp = lock_stp->st_file; } - /* lock->lk_replay_owner and lock_stp have been created or found */ + /* lock_sop and lock_stp have been created or found */ lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); @@ -4191,13 +4185,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - locku->lu_seqid, - &locku->lu_stateid, - LOCK_STATE, - &locku->lu_stateowner, &stp))) + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, LOCK_STATE, &stp); + if (status) goto out; - filp = find_any_file(stp->st_file); if (!filp) { status = nfserr_lock_range; @@ -4206,7 +4197,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_owner = (fl_owner_t) stp->st_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ee1267838719..462c6eff8471 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { DECODE_HEAD; - close->cl_stateowner = NULL; READ_BUF(4); READ32(close->cl_seqid); return nfsd4_decode_stateid(argp, &close->cl_stateid); @@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { DECODE_HEAD; - lock->lk_replay_owner = NULL; /* * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ @@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { DECODE_HEAD; - locku->lu_stateowner = NULL; READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) @@ -739,7 +736,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con { DECODE_HEAD; - open_conf->oc_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); if (status) return status; @@ -754,7 +750,6 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d { DECODE_HEAD; - open_down->od_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 663193b21a24..341f0a17d217 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -81,7 +81,6 @@ struct nfsd4_access { struct nfsd4_close { u32 cl_seqid; /* request */ stateid_t cl_stateid; /* request+response */ - struct nfs4_stateowner * cl_stateowner; /* response */ }; struct nfsd4_commit { @@ -165,9 +164,6 @@ struct nfsd4_lock { } ok; struct nfsd4_lock_denied denied; } u; - /* The 
lk_replay_owner is the open owner in the open_to_lock_owner - * case and the lock owner otherwise: */ - struct nfs4_stateowner *lk_replay_owner; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -199,7 +195,6 @@ struct nfsd4_locku { stateid_t lu_stateid; u64 lu_offset; u64 lu_length; - struct nfs4_stateowner *lu_stateowner; }; @@ -243,7 +238,6 @@ struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; u32 oc_seqid /* request */; stateid_t oc_resp_stateid /* response */; - struct nfs4_stateowner * oc_stateowner; /* response */ }; struct nfsd4_open_downgrade { @@ -251,7 +245,6 @@ struct nfsd4_open_downgrade { u32 od_seqid; u32 od_share_access; u32 od_share_deny; - struct nfs4_stateowner *od_stateowner; }; -- cgit v1.2.3-58-ga151 From 5ec094c1096ab3bb795651855d53f18daa26afde Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Aug 2011 17:02:48 -0400 Subject: nfsd4: extend state lock over seqid replay logic There are currently a couple races in the seqid replay code: a retransmission could come while we're still encoding the original reply, or a new seqid-mutating call could come as we're encoding a replay. So, extend the state lock over the encoding (both encoding of a replayed reply and caching of the original encoded reply). I really hate doing this, and previously added the stateowner reference-counting code to avoid it (which was insufficient)--but I don't see a less complicated alternative at the moment. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 5 +++-- fs/nfsd/nfs4state.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 50bae7471147..50063a85f505 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -408,8 +408,8 @@ out: if (open->op_stateowner) { nfs4_get_stateowner(open->op_stateowner); cstate->replay_owner = open->op_stateowner; - } - nfs4_unlock_state(); + } else + nfs4_unlock_state(); return status; } @@ -1227,6 +1227,7 @@ encode_op: be32_to_cpu(status)); if (cstate->replay_owner) { + nfs4_unlock_state(); nfs4_put_stateowner(cstate->replay_owner); cstate->replay_owner = NULL; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bc1a9dbc289c..6cf729a096c3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3501,7 +3501,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_create_clid_dir(sop->so_client); out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3568,7 +3569,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); status = nfs_ok; out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3609,7 +3611,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (list_empty(&so->so_stateids)) move_to_close_lru(so); out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -4071,7 +4074,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } -- cgit v1.2.3-58-ga151 From fff6ca9cc46857e5814cf687e5fb1b8a876766a4 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Thu, 25 Aug 2011 18:17:52 -0400 Subject: nfsd4: eliminate impossible open replay case If open fails with any error other than nfserr_replay_me, then the main nfsd4_proc_compound() loop continues unconditionally to nfsd4_encode_operation(), which will always call encode_seqid_op_tail. Thus the condition we check for here does not occur. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6cf729a096c3..26b0c75aa93b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2495,18 +2495,8 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_stateowner = NULL; goto renew; } - if (open->op_seqid == sop->so_seqid - 1) { - if (sop->so_replay.rp_buflen) - return nfserr_replay_me; - /* The original OPEN failed so spectacularly - * that we don't even have replay data saved! - * Therefore, we have no choice but to continue - * processing this OPEN; presumably, we'll - * fail again for the same reason. - */ - dprintk("nfsd4_process_open1: replay with no replay cache\n"); - goto renew; - } + if (open->op_seqid == sop->so_seqid - 1) + return nfserr_replay_me; if (open->op_seqid != sop->so_seqid) return nfserr_bad_seqid; renew: -- cgit v1.2.3-58-ga151 From 7c13f344cf8bec22301c5ed7ef1d90eecb57ba43 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Aug 2011 22:15:47 -0400 Subject: nfsd4: drop most stateowner refcounting Maybe we'll bring it back some day, but we don't have much real use for it now. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 6 ++---- fs/nfsd/nfs4state.c | 23 ++++++++++++----------- fs/nfsd/nfs4xdr.c | 11 ++++++----- fs/nfsd/state.h | 15 +-------------- fs/nfsd/xdr4.h | 2 +- 5 files changed, 22 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 50063a85f505..ce151f0ed4b9 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -405,10 +405,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: - if (open->op_stateowner) { - nfs4_get_stateowner(open->op_stateowner); + if (open->op_stateowner) cstate->replay_owner = open->op_stateowner; - } else + else nfs4_unlock_state(); return status; } @@ -1228,7 +1227,6 @@ encode_op: if (cstate->replay_owner) { nfs4_unlock_state(); - nfs4_put_stateowner(cstate->replay_owner); cstate->replay_owner = NULL; } /* XXX Ugh, we need to get rid of this kind of special case: */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 26b0c75aa93b..834a5f844f42 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -453,7 +453,7 @@ static void unhash_lockowner(struct nfs4_stateowner *sop) static void release_lockowner(struct nfs4_stateowner *sop) { unhash_lockowner(sop); - nfs4_put_stateowner(sop); + nfs4_free_stateowner(sop); } static void @@ -496,7 +496,7 @@ static void release_openowner(struct nfs4_stateowner *sop) { unhash_openowner(sop); list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + nfs4_free_stateowner(sop); } #define SESSION_HASH_SIZE 512 @@ -2206,10 +2206,8 @@ out_nomem: } void -nfs4_free_stateowner(struct kref *kref) +nfs4_free_stateowner(struct nfs4_stateowner *sop) { - struct nfs4_stateowner *sop = - container_of(kref, struct nfs4_stateowner, so_ref); kfree(sop->so_owner.data); kmem_cache_free(stateowner_slab, sop); } @@ -2236,7 +2234,6 @@ static inline struct 
nfs4_stateowner *alloc_stateowner(struct xdr_netobj *owner, } sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); INIT_LIST_HEAD(&sop->so_perstateid); @@ -3413,14 +3410,12 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, /* It's not stale; let's assume it's expired: */ if (sop == NULL) return nfserr_expired; - nfs4_get_stateowner(sop); cstate->replay_owner = sop; goto check_replay; } *stpp = stp; sop = stp->st_stateowner; - nfs4_get_stateowner(sop); cstate->replay_owner = sop; if (nfs4_check_fh(current_fh, stp)) { @@ -3783,11 +3778,17 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) if (fl->fl_lmops == &nfsd_posix_mng_ops) { sop = (struct nfs4_stateowner *) fl->fl_owner; - kref_get(&sop->so_ref); - deny->ld_sop = sop; + deny->ld_owner.data = kmemdup(sop->so_owner.data, + sop->so_owner.len, GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_owner.len = sop->so_owner.len; deny->ld_clientid = sop->so_client->cl_clientid; } else { - deny->ld_sop = NULL; +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 462c6eff8471..c4dcba3aac1f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2570,17 +2570,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh static void nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) { + struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); + RESERVE_SPACE(32 + XDR_LEN(conf->len)); WRITE64(ld->ld_start); WRITE64(ld->ld_length); WRITE32(ld->ld_type); - if (ld->ld_sop) { + if (conf->len) { WRITEMEM(&ld->ld_clientid, 8); - WRITE32(ld->ld_sop->so_owner.len); - WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); - kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + WRITE32(conf->len); + WRITEMEM(conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ WRITE64((u64)0); /* clientid */ WRITE32(0); /* length of owner name */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c425717715f6..f7114fc21dee 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -338,7 +338,6 @@ struct nfs4_replay { * reaped by laundramat thread after lease period. 
*/ struct nfs4_stateowner { - struct kref so_ref; struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ struct list_head so_perclient; @@ -459,7 +458,7 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct kref *kref); +extern void nfs4_free_stateowner(struct nfs4_stateowner *sop); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); @@ -482,16 +481,4 @@ extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(stateid_t *, bool); -static inline void -nfs4_put_stateowner(struct nfs4_stateowner *so) -{ - kref_put(&so->so_ref, nfs4_free_stateowner); -} - -static inline void -nfs4_get_stateowner(struct nfs4_stateowner *so) -{ - kref_get(&so->so_ref); -} - #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 341f0a17d217..de236fb89e74 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -130,7 +130,7 @@ struct nfsd4_link { struct nfsd4_lock_denied { clientid_t ld_clientid; - struct nfs4_stateowner *ld_sop; + struct xdr_netobj ld_owner; u64 ld_start; u64 ld_length; u32 ld_type; -- cgit v1.2.3-58-ga151 From 16d259418b7c0dda79b71bfbfeaedc0ba4035f23 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 1 Sep 2011 11:31:45 -0400 Subject: nfsd4: eliminate unused lt_stateowner This is used only as a local variable. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- fs/nfsd/xdr4.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 834a5f844f42..a47bf884726f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4098,6 +4098,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; + struct nfs4_stateowner *so; int error; __be32 status; @@ -4107,7 +4108,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - lockt->lt_stateowner = NULL; nfs4_lock_state(); status = nfserr_stale_clientid; @@ -4134,10 +4134,10 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lockt->lt_stateowner = find_lockstateowner_str(inode, + so = find_lockstateowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); - if (lockt->lt_stateowner) - file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + if (so) + file_lock.fl_owner = (fl_owner_t)so; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index de236fb89e74..27a3dfab96a9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -184,7 +184,6 @@ struct nfsd4_lockt { struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfs4_stateowner * lt_stateowner; struct nfsd4_lock_denied lt_denied; }; -- cgit v1.2.3-58-ga151 From 7a8711c9a6e2299c324c153da6e3381b1fd56128 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 2 Sep 2011 09:03:37 -0400 Subject: nfsd4: share common seqid checks Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a47bf884726f..8edc9ad63ea6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2456,6 +2456,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = { .lm_change = nfsd_change_deleg_cb, }; +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, @@ -2465,6 +2475,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_stateowner *sop = NULL; + __be32 status; if (!check_name(open->op_owner)) return nfserr_inval; @@ -2482,9 +2493,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_expired; goto renew; } - /* When sessions are used, skip open sequenceid processing */ - if (nfsd4_has_session(cstate)) - goto renew; if (!sop->so_confirmed) { /* Replace unconfirmed owners without checking for replay. */ clp = sop->so_client; @@ -2492,10 +2500,9 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_stateowner = NULL; goto renew; } - if (open->op_seqid == sop->so_seqid - 1) - return nfserr_replay_me; - if (open->op_seqid != sop->so_seqid) - return nfserr_bad_seqid; + status = nfsd4_check_seqid(cstate, sop, open->op_seqid); + if (status) + return status; renew: if (open->op_stateowner == NULL) { sop = alloc_init_open_stateowner(strhashval, clp, open); @@ -3411,7 +3418,10 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (sop == NULL) return nfserr_expired; cstate->replay_owner = sop; - goto check_replay; + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + return nfserr_bad_seqid; } *stpp = stp; @@ -3423,8 +3433,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfserr_bad_stateid; } - if (!nfsd4_has_session(cstate) && seqid != sop->so_seqid) - goto check_replay; + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; if (sop->so_confirmed && flags & CONFIRM) { dprintk("NFSD: preprocess_seqid_op: expected" @@ -3441,16 +3452,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return status; renew_client(sop->so_client); return nfs_ok; - -check_replay: - if (seqid == sop->so_seqid - 1) { - dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); - /* indicate replay to calling function */ - return nfserr_replay_me; - } - dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", - sop->so_seqid, seqid); - return nfserr_bad_seqid; } __be32 -- cgit v1.2.3-58-ga151 From 77eaae8d44ec5942033b751d0e0d2914c9411862 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 2 Sep 2011 12:08:20 -0400 Subject: nfsd4: simplify check_open logic Sometimes the single-exit style is good, sometimes it's unnecessarily convoluted.... Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8edc9ad63ea6..8694e60a4520 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2573,7 +2573,6 @@ static __be32 nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) { struct nfs4_stateid *local; - __be32 status = nfserr_share_denied; struct nfs4_stateowner *sop = open->op_stateowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { @@ -2585,11 +2584,9 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_state *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) - goto out; + return nfserr_share_denied; } - status = 0; -out: - return status; + return nfs_ok; } static inline struct nfs4_stateid * -- cgit v1.2.3-58-ga151 From 68b66e8270f44d297a48662e6aed72c5944f77bd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 2 Sep 2011 12:19:43 -0400 Subject: nfsd4: move double-confirm test to open_confirm I don't see the point of having this check in nfs4_preprocess_seqid_op() when it's only needed by the one caller. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8694e60a4520..9c44630a245a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3434,11 +3434,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (status) return status; - if (sop->so_confirmed && flags & CONFIRM) { - dprintk("NFSD: preprocess_seqid_op: expected" - " unconfirmed stateowner!\n"); - return nfserr_bad_stateid; - } if (!sop->so_confirmed && !(flags & CONFIRM)) { dprintk("NFSD: preprocess_seqid_op: stateowner not" " confirmed yet!\n"); @@ -3473,9 +3468,11 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, &stp); if (status) - goto out; - + goto out; sop = stp->st_stateowner; + status = nfserr_bad_stateid; + if (sop->so_confirmed) + goto out; sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3483,6 +3480,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); nfsd4_create_clid_dir(sop->so_client); + status = nfs_ok; out: if (!cstate->replay_owner) nfs4_unlock_state(); -- cgit v1.2.3-58-ga151 From 4e96b2dbbf1d7e81f22047a50f862555a6cb87cb Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:51:09 -0400 Subject: ext4: Add new ext4_discard_partial_page_buffers routines This patch adds two new routines: ext4_discard_partial_page_buffers and ext4_discard_partial_page_buffers_no_lock. The ext4_discard_partial_page_buffers routine is a wrapper function to ext4_discard_partial_page_buffers_no_lock. The wrapper function locks the page and passes it to ext4_discard_partial_page_buffers_no_lock. Calling functions that already have the page locked can call ext4_discard_partial_page_buffers_no_lock directly. The ext4_discard_partial_page_buffers_no_lock function zeros a specified range in a page, and unmaps the corresponding buffer heads. Only block aligned regions of the page will have their buffer heads unmapped. 
Non-block-aligned regions will be mapped if needed so that they can be updated with the partial zero out. This function is meant to be used to update a page and its buffer heads to be zeroed and unmapped when the corresponding blocks have been released or will be released. This routine is used in the following scenarios: * A hole is punched and the non page aligned regions of the head and tail of the hole need to be discarded * The file is truncated and the partial page beyond EOF needs to be discarded * The end of a hole is in the same page as EOF. After the page is flushed, the partial page beyond EOF needs to be discarded. * A write operation begins or ends inside a hole and the partial page appearing before or after the write needs to be discarded * A write operation extends EOF and the partial page beyond EOF needs to be discarded This function takes a flag EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED which is used when a write operation begins or ends in a hole. When the EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED flag is used, only buffer heads that are already unmapped will have the corresponding regions of the page zeroed. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 +++ fs/ext4/inode.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 235 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5c38120c389c..ccfa81f33bb0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -528,6 +528,11 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +/* + * Flags used by ext4_discard_partial_page_buffers + */ +#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 + /* * ioctl commands */ @@ -1838,6 +1843,12 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b84f127c085d..d1b1ef71e5b7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2966,6 +2966,230 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_journalled_aops; } + +/* + * ext4_discard_partial_page_buffers() + * Wrapper function for ext4_discard_partial_page_buffers_no_lock. + * This function finds and locks the page containing the offset + * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. + * Calling functions that already have the page locked should call + * ext4_discard_partial_page_buffers_no_lock directly. 
+ */ +int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags) +{ + struct inode *inode = mapping->host; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -EINVAL; + + err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, + from, length, flags); + + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_discard_partial_page_buffers_no_lock() + * Zeros a page range of length 'length' starting from offset 'from'. + * Buffer heads that correspond to the block aligned regions of the + * zeroed range will be unmapped. Non-block-aligned regions + * will have the corresponding buffer head mapped if needed so that + * that region of the page can be updated with the partial zero out. + * + * This function assumes that the page has already been locked. The + * range to be discarded must be contained within the given page. + * If the specified range exceeds the end of the page it will be shortened + * to the end of the page that corresponds to 'from'. This function is + * appropriate for updating a page and its buffer heads to be unmapped and + * zeroed for blocks that have been either released, or are going to be + * released. + * + * handle: The journal handle + * inode: The files inode + * page: A locked page that contains the offset "from" + * from: The starting byte offset (from the beginning of the file) + * to begin discarding + * len: The length of bytes to discard + * flags: Optional flags that may be used: + * + * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED + * Only zero the regions of the page whose buffer heads + * have already been unmapped. This flag is appropriate + * for updating the contents of a page whose blocks may + * have already been released, and we only want to zero + * out the regions that correspond to those released blocks. + * + * Returns zero on success or negative on failure. + */ +int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned int offset = from & (PAGE_CACHE_SIZE-1); + unsigned int blocksize, max, pos; + unsigned int end_of_block, range_to_discard; + ext4_lblk_t iblock; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + max = PAGE_CACHE_SIZE - offset; + + if (index != page->index) + return -EINVAL; + + /* + * correct length if it does not fall between + * 'from' and the end of the page + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) { + /* + * If the range to be discarded covers a partial block + * we need to get the page buffers. This is because + * partial blocks cannot be released and the page needs + * to be updated with the contents of the block before + * we write the zeros on top of it. 
+ */ + if (!(from & (blocksize - 1)) || + !((from + length) & (blocksize - 1))) { + create_empty_buffers(page, blocksize, 0); + } else { + /* + * If there are no partial blocks, + * there is nothing to update, + * so we can return now + */ + return 0; + } + } + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + pos = offset; + while (pos < offset + length) { + err = 0; + + /* The length of space left to zero and unmap */ + range_to_discard = offset + length - pos; + + /* The length of space until the end of the block */ + end_of_block = blocksize - (pos & (blocksize-1)); + + /* + * Do not unmap or zero past end of block + * for this buffer head + */ + if (range_to_discard > end_of_block) + range_to_discard = end_of_block; + + + /* + * Skip this buffer head if we are only zeroing unmapped + * regions of the page + */ + if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && + buffer_mapped(bh)) + goto next; + + /* If the range is block aligned, unmap */ + if (range_to_discard == blocksize) { + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + clear_buffer_uptodate(bh); + zero_user(page, pos, range_to_discard); + BUFFER_TRACE(bh, "Buffer discarded"); + goto next; + } + + /* + * If this block is not completely contained in the range + * to be discarded, then it is not going to be released. Because + * we need to keep this block, we need to make sure this part + * of the page is uptodate before we modify it by writing + * partial zeros on it. + */ + if (!buffer_mapped(bh)) { + /* + * Buffer head must be mapped before we can read + * from the block + */ + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto next; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt.*/ + if (!buffer_uptodate(bh)) + goto next; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto next; + } + + zero_user(page, pos, range_to_discard); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + if (ext4_should_order_data(inode) && + EXT4_I(inode)->jinode) + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + + BUFFER_TRACE(bh, "Partial buffer zeroed"); +next: + bh = bh->b_this_page; + iblock++; + pos += range_to_discard; + } + + return err; +} + /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. -- cgit v1.2.3-58-ga151 From ba06208a1315ab2d2217e09c79582b886c9f629e Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:55:59 -0400 Subject: ext4: fix xfstests 75, 112, 127 punch hole failure This patch addresses a bug found by xfstests 75, 112, 127 when blocksize = 1k. This bug happens because the punch hole code only zeros out non block aligned regions of the page. 
This means that if the blocks are smaller than a page, then the block aligned regions of the page inside the hole are left un-zeroed, and their buffer heads are still mapped. This bug is corrected by using ext4_discard_partial_page_buffers to properly zero the partial page at the head and tail of the hole, and unmap the corresponding buffer heads. This patch also addresses a bug reported by Lukas while working on a new patch to add discard support for loop devices using punch hole. The bug happened because the first and last block numbers needed to be cast to a larger data type before calculating the byte offset, but since now we only need the byte offsets of the pages, we no longer even need to be calculating the byte offsets of the blocks. The code to do the block offset calculations is removed in this patch. Signed-off-by: Allison Henderson --- fs/ext4/extents.c | 61 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57cf568a98ab..18f7e04a4fa3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4162,17 +4162,14 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) struct address_space *mapping = inode->i_mapping; struct ext4_map_blocks map; handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4211,24 +4208,44 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; /* - * Now we need to zero out the un block aligned data. - * If the file is smaller than a block, just - * zero out the middle + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zeroed. 
*/ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (err) + goto out; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; } } -- cgit v1.2.3-58-ga151 From 2be4751b21ae1cacb002da48cfc5bf6743fee8c1 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:56:52 -0400 Subject: ext4: fix 2nd xfstests 127 punch hole failure This patch fixes a second punch hole bug found by xfstests 127. This bug happens because punch hole needs to flush the pages of the hole to avoid race conditions. But if the end of the hole is in the same page as i_size, the buffer heads beyond i_size need to be unmapped and the page needs to be zeroed after it is flushed. To correct this, the new ext4_discard_partial_page_buffers routine is used to zero and unmap the partial page beyond i_size if the end of the hole appears in the same page as i_size. The code has also been optimized to set the end of the hole to the page after i_size if the specified hole exceeds i_size, and the code that flushes the pages has been simplified. Signed-off-by: Allison Henderson --- fs/ext4/extents.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 18f7e04a4fa3..9124cd24e093 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4166,6 +4166,20 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + return 0; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4182,11 +4196,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) */ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 
0 : first_page_offset-1, - last_page_offset); + offset, offset + length - 1); - if (err) - return err; + if (err) + return err; } /* Now release the pages */ @@ -4249,6 +4262,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out; + } + } + /* If there are no blocks to remove, return now */ if (first_block >= last_block) goto out; -- cgit v1.2.3-58-ga151 From 56889787cfa77dfd96f0b3a3e6a4f26c2e4a5134 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Sep 2011 18:22:38 -0400 Subject: ext4: improve handling of conflicting mount options If the user explicitly specifies conflicting mount options for delalloc or dioread_nolock and data=journal, fail the mount, instead of printing a warning and continuing (since many users won't look at dmesg and notice the warning). Also, print a single warning that data=journal implies that delayed allocation is not on by default (since it's not supported), and furthermore that O_DIRECT is not supported. Improve the text in Documentation/filesystems/ext4.txt so this is clear there as well. Similarly, if the dioread_nolock mount option is specified when the file system block size != PAGE_SIZE, fail the mount instead of printing a warning message and ignoring the mount option. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 8 +++--- fs/ext4/ext4.h | 3 +++ fs/ext4/super.c | 50 ++++++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 232a575a0c48..168242b5c045 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -160,7 +160,9 @@ noload if the filesystem was not unmounted cleanly, lead to any number of problems. data=journal All data are committed into the journal prior to being - written into the main file system. + written into the main file system. Enabling + this mode will disable delayed allocation and + O_DIRECT support. data=ordered (*) All data are forced directly out to the main file system prior to its metadata being committed to the @@ -419,8 +421,8 @@ written to the journal first, and then to its final location. In the event of a crash, the journal can be replayed, bringing both data and metadata into a consistent state. This mode is the slowest except when data needs to be read from and written to disk at the same time where it -outperforms all others modes. Currently ext4 does not have delayed -allocation support if this data journalling mode is selected. +outperforms all others modes. Enabling this mode will disable delayed +allocation and O_DIRECT support. 
/proc entries ============= diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ccfa81f33bb0..48ae98819d35 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -922,6 +922,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 44d0c8db2239..ee2f74a7084d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1801,6 +1801,7 @@ set_qf_format: break; case Opt_nodelalloc: clear_opt(sb, DELALLOC); + clear_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_mblk_io_submit: set_opt(sb, MBLK_IO_SUBMIT); @@ -1817,6 +1818,7 @@ set_qf_format: break; case Opt_delalloc: set_opt(sb, DELALLOC); + set_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_block_validity: set_opt(sb, BLOCK_VALIDITY); @@ -3224,6 +3226,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &journal_ioprio, NULL, 0)) goto failed_mount; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + goto failed_mount; + } + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3265,8 +3294,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -3679,25 +3706,6 @@ no_journal: "available"); } - if (test_opt(sb, DELALLOC) && - (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " - "requested data journaling mode"); - clear_opt(sb, DELALLOC); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - requested data journaling mode"); - clear_opt(sb, DIOREAD_NOLOCK); - } - if (sb->s_blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - block size is too small"); - clear_opt(sb, DIOREAD_NOLOCK); - } - } - err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " -- cgit v1.2.3-58-ga151 From f4dee24cca98739a4190a00fa014cd1b7e2581a4 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 2 Sep 2011 16:36:49 -0400 Subject: nfsd4: move CLOSE_STATE special case to caller Move the CLOSE_STATE case into the unique caller that cares about it rather than putting it in preprocess_seqid_op. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 56 ++++++++++++++++++++++++++--------------------------- fs/nfsd/state.h | 1 - 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9c44630a245a..eb11626babc6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3070,15 +3070,13 @@ laundromat_main(struct work_struct *not_used) } static struct nfs4_stateowner * -search_close_lru(u32 st_id, int flags) +search_close_lru(u32 st_id) { - struct nfs4_stateowner *local = NULL; + struct nfs4_stateowner *local; - if (flags & CLOSE_STATE) { - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) - return local; - } + list_for_each_entry(local, &close_lru, so_close_lru) { + if (local->so_id == st_id) + return local; } return NULL; } @@ -3381,7 +3379,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateid **stpp) { - struct nfs4_stateid *stp; struct nfs4_stateowner *sop; struct svc_fh *current_fh = &cstate->current_fh; __be32 status; @@ -3404,28 +3401,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * the confirmed flag is incorrecly set, or the generation * number is incorrect. */ - stp = find_stateid(stateid, flags); - if (stp == NULL) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - sop = search_close_lru(stateid->si_stateownerid, flags); - /* It's not stale; let's assume it's expired: */ - if (sop == NULL) - return nfserr_expired; - cstate->replay_owner = sop; - status = nfsd4_check_seqid(cstate, sop, seqid); - if (status) - return status; - return nfserr_bad_seqid; - } + *stpp = find_stateid(stateid, flags); + if (*stpp == NULL) + return nfserr_expired; - *stpp = stp; - sop = stp->st_stateowner; + sop = (*stpp)->st_stateowner; cstate->replay_owner = sop; - if (nfs4_check_fh(current_fh, stp)) { + if (nfs4_check_fh(current_fh, *stpp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; } @@ -3439,7 +3422,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &(*stpp)->st_stateid, nfsd4_has_session(cstate)); if (status) return status; renew_client(sop->so_client); @@ -3574,7 +3557,22 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check close_lru for replay */ status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, &stp); + OPEN_STATE, &stp); + if (stp == NULL && status == nfserr_expired) { + /* + * Also, we should make sure this isn't just the result of + * a replayed close: + */ + so = search_close_lru(close->cl_stateid.si_stateownerid); + /* It's not stale; let's assume it's expired: */ + if (so == NULL) + goto out; + cstate->replay_owner = so; + status = nfsd4_check_seqid(cstate, so, close->cl_seqid); + if (status) + goto out; + status = nfserr_bad_seqid; + } if (status) goto out; so = stp->st_stateowner; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index f7114fc21dee..0d88000d15d7 
100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -448,7 +448,6 @@ struct nfs4_stateid { #define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -#define CLOSE_STATE 0x00000040 struct nfsd4_compound_state; -- cgit v1.2.3-58-ga151 From 9ea7a0df63630ad8197716cd313ea66e28906fc0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 4 Sep 2011 10:18:14 -0400 Subject: jbd2: add debugging information to jbd2_journal_dirty_metadata() Add debugging information in case jbd2_journal_dirty_metadata() is called with a buffer_head which didn't have jbd2_journal_get_write_access() called on it, or if the journal_head has the wrong transaction in it. In addition, return an error code. This won't change anything for ocfs2, which will BUG_ON() the non-zero exit code. For ext4, the caller of this function is ext4_handle_dirty_metadata(), and on seeing a non-zero return code, will call __ext4_journal_stop(), which will print the function and line number of the (buggy) calling function and abort the journal. This will allow us to recover instead of bug halting, which is better from a robustness and reliability point of view. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 8 ++++--- fs/ext4/extents.c | 10 ++++++--- fs/jbd2/transaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index f5240aa15601..aca179017582 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (err) { + /* Errors can only happen if there is a bug */ + handle->h_err = err; + __ext4_journal_stop(where, line, handle); + } } else { if (inode) mark_buffer_dirty_inode(bh, inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9124cd24e093..2c5216a8d03b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -96,13 +96,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; if (path->p_bh) { /* path points to block */ - err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d7109414cdd..cb56fe9aaabb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1049,6 +1049,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, * mark dirty metadata which needs to be journaled as part of the current * transaction. * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. 
* @@ -1065,11 +1069,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; struct journal_head *jh = bh2jh(bh); + int ret = 0; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; + if (!buffer_jbd(bh)) { + ret = -EUCLEAN; + goto out; + } jbd_lock_bh_state(bh); @@ -1093,8 +1102,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } goto out_unlock_bh; } @@ -1108,9 +1129,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + if (unlikely(jh->b_transaction != + journal->j_committing_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_committing_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_committing_transaction, + journal->j_committing_transaction ? + journal->j_committing_transaction->t_tid : 0); + ret = -EINVAL; + } + if (unlikely(jh->b_next_transaction != transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_next_transaction (%llu, %p, %u) != " + "transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + transaction, transaction->t_tid); + ret = -EINVAL; + } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. 
*/ goto out_unlock_bh; @@ -1127,7 +1171,9 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - return 0; + if (ret) + __WARN(); /* All errors are bugs, so dump the stack */ + return ret; } /* -- cgit v1.2.3-58-ga151 From d2159fb7b8bac12684aabdf41d84b56da9f5c062 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 4 Sep 2011 10:20:14 -0400 Subject: jbd2: use gfp_t instead of int This silences some Sparse warnings: fs/jbd2/transaction.c:135:69: warning: incorrect type in argument 2 (different base types) fs/jbd2/transaction.c:135:69: expected restricted gfp_t [usertype] flags fs/jbd2/transaction.c:135:69: got int [signed] gfp_mask Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 6 +++--- include/linux/jbd2.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cb56fe9aaabb..b01fd6104089 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -115,7 +115,7 @@ static inline void update_t_max_wait(transaction_t *transaction, */ static int start_this_handle(journal_t *journal, handle_t *handle, - int gfp_mask) + gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; tid_t tid; @@ -320,7 +320,7 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -443,7 +443,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 38f307b8c334..3dd101e49d36 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1106,9 +1106,9 @@ static inline handle_t *journal_current_handle(void) */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); -extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask); +extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask); extern int jbd2_journal_restart(handle_t *, int nblocks); -extern int jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask); +extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); -- cgit v1.2.3-58-ga151 From decbd919f4bb9cb698486880c026c4104b13d3c3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Sep 2011 02:37:06 -0400 Subject: ext4: only call ext4_jbd2_file_inode when an inode has been extended In delayed allocation mode, it's important to only call ext4_jbd2_file_inode when the file has been extended. This is necessary to avoid a race which first got introduced in commit 678aaf481, but which was made much more common with the introduction of the "punch hole" functionality. (Especially when dioread_nolock was enabled; when I could reliably reproduce this problem with xfstests #74.) 
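As a rough standalone model (the stub names below are illustrative, not the real ext4/jbd2 symbols), the rule this patch enforces boils down to a single predicate: file the inode for ordered-data writeout only when the operation allocated blocks whose stale contents could otherwise become visible, and never for truncate or punch hole:

#include <stdbool.h>
#include <stdio.h>

struct inode { long i_size; };

/* Stand-in for ext4_jbd2_file_inode(): in the kernel this puts the
 * inode on the committing transaction's t_inode_list. */
static int file_inode_for_ordered_data(struct inode *inode)
{
	printf("filed on t_inode_list (i_size=%ld)\n", inode->i_size);
	return 0;
}

/* Only order the data when new blocks were actually allocated. */
static int maybe_file_inode(struct inode *inode, bool allocated_blocks,
			    bool ordered_mode)
{
	if (ordered_mode && allocated_blocks)
		return file_inode_for_ordered_data(inode);
	return 0;	/* truncate/punch hole: no stale data to hide */
}

int main(void)
{
	struct inode ino = { .i_size = 8192 };

	maybe_file_inode(&ino, true, true);	/* extending write: filed */
	maybe_file_inode(&ino, false, true);	/* punch hole: skipped */
	return 0;
}

Keeping the inode off the list in the second case is what breaks the deadlock described next.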
The race is this: If while trying to writeback a delayed allocation inode, there is a need to map delalloc blocks, and we run out of space in the journal, *and* at the same time the inode is already on the committing transaction's t_inode_list (because for example while doing the punch hole operation, ext4_jbd2_file_inode() is called), then the commit operation will wait for the inode to finish all of its pending writebacks by calling filemap_fdatawait(), but since that inode has one or more pages with the PageWriteback flag set, the commit operation will wait forever, and so the writeback of the inode can never take place, and the kjournald thread and the writeback thread end up waiting for each other --- forever. It's important at this point to recall why an inode is placed on the t_inode_list; it is to provide the data=ordered guarantees that we don't end up exposing stale data. In the case where we are truncating or punching a hole in the inode, there is no possibility that stale data could be exposed in the first place, so we don't need to put the inode on the t_inode_list! The right long-term fix is to get rid of data=ordered mode altogether, and only update the extent tree or indirect blocks after the data has been written. Until then, this change will also avoid some unnecessary waiting in the commit operation. Signed-off-by: "Theodore Ts'o" Cc: Allison Henderson Cc: Jan Kara --- fs/ext4/inode.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1b1ef71e5b7..f86b149fb8b1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1471,13 +1471,13 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - } - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) - /* This only happens if the journal is aborted */ - return; + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + /* Only if the journal is aborted */ + return; + } } /* @@ -3173,12 +3173,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); - } else { - if (ext4_should_order_data(inode) && - EXT4_I(inode)->jinode) - err = ext4_jbd2_file_inode(handle, inode); + } else mark_buffer_dirty(bh); - } BUFFER_TRACE(bh, "Partial buffer zeroed"); next: @@ -3301,11 +3297,8 @@ int ext4_block_zero_page_range(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); - } else { - if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) - err = ext4_jbd2_file_inode(handle, inode); + } else mark_buffer_dirty(bh); unlock: unlock_page(page); -- cgit v1.2.3-58-ga151 From 189e868fa8fdca702eb9db9d8afc46b5cb9144c9 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Tue, 6 Sep 2011 21:49:44 -0400 Subject: ext4: fix fsx truncate failure While running extended fsx tests to verify the first two patches, a similar bug was also found in the truncate operation. This bug happens because the truncate routine only zeros the non-block-aligned portion of the last page. This means that the block aligned portions of the page appearing after i_size are left unzeroed, and the buffer heads still mapped. 
This bug is corrected by using ext4_discard_partial_page_buffers in the truncate routine to zero the partial page and unmap the buffer heads. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++++++-- fs/ext4/indirect.c | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2c5216a8d03b..ba7bd5a176ce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3653,6 +3653,7 @@ void ext4_ext_truncate(struct inode *inode) struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; handle_t *handle; + loff_t page_len; int err = 0; /* @@ -3669,8 +3670,16 @@ void ext4_ext_truncate(struct inode *inode) if (IS_ERR(handle)) return; - if (inode->i_size & (sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } if (ext4_orphan_add(handle, inode)) goto out_stop; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 0962642119c0..ea704dff8669 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1343,7 +1343,9 @@ void ext4_ind_truncate(struct inode *inode) __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; + loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; + int err; handle = start_transaction(inode); if (IS_ERR(handle)) @@ -1354,9 +1356,16 @@ void ext4_ind_truncate(struct inode *inode) max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) goto out_stop; + } if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); -- cgit v1.2.3-58-ga151 From 02fac1297eb3f471a27368271aadd285548297b0 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Tue, 6 Sep 2011 21:53:01 -0400 Subject: ext4: fix partial page writes While running extended fsx tests to verify the preceding patches, a similar bug was also found in the write operation. Whenever a write operation begins or ends in a hole, or extends EOF, the partial page contained in the hole or beyond EOF needs to be zeroed out. To correct this, the new ext4_discard_partial_page_buffers_no_lock routine is used to zero out the partial page, but only for buffer heads that are already unmapped. 
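The byte ranges passed to ext4_discard_partial_page_buffers_no_lock() here are plain mask arithmetic; this self-contained sketch (the 4 KiB page size and the sample pos/copied values are assumptions for illustration, not taken from the patch) prints the head and tail spans that get zeroed around a write:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL	/* assumed page size */

int main(void)
{
	unsigned long pos = 5000, copied = 3000;	/* sample write */

	/* write_begin: partial page preceding the start of the write */
	unsigned long head_len = pos & (PAGE_CACHE_SIZE - 1);

	/* write_end: partial page following the last byte written */
	unsigned long tail_len = PAGE_CACHE_SIZE -
		((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));

	printf("head: zero %lu bytes from offset %lu\n",
	       head_len, pos - head_len);
	printf("tail: zero %lu bytes from offset %lu\n",
	       tail_len, pos + copied - 1);
	return 0;
}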
Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f86b149fb8b1..6ecc93979e48 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2255,6 +2255,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; handle_t *handle; + loff_t page_len; index = pos >> PAGE_CACHE_SHIFT; @@ -2301,6 +2302,13 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); + } else { + page_len = pos & (PAGE_CACHE_SIZE - 1); + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers_no_lock(handle, + inode, page, pos - page_len, page_len, + EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); + } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -2343,6 +2351,7 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; + loff_t page_len; if (write_mode == FALL_BACK_TO_NONDELALLOC) { if (ext4_should_order_data(inode)) { @@ -2391,6 +2400,16 @@ static int ext4_da_write_end(struct file *file, } ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + + page_len = PAGE_CACHE_SIZE - + ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers_no_lock(handle, + inode, page, pos + copied - 1, page_len, + EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); + } + copied = ret2; if (ret2 < 0) ret = ret2; -- cgit v1.2.3-58-ga151 From fe0750e5c43189adb6e6fc59837af7d5a588f413 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 30 Jul 2011 23:33:59 -0400 Subject: nfsd4: split stateowners into open and lockowners The stateowner has some fields that only make sense for openowners, and some that only make sense for lockowners, and I find it a lot clearer if those are separated out. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 18 +-- fs/nfsd/nfs4state.c | 367 ++++++++++++++++++++++++++-------------------------- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 33 ++++- fs/nfsd/xdr4.h | 2 +- 5 files changed, 224 insertions(+), 198 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ce151f0ed4b9..460eeb329d81 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -250,7 +250,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, @@ -277,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && @@ -306,9 +306,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_compoundres *resp; - dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, - open->op_stateowner); + open->op_openowner); /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) @@ -332,7 +332,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { - struct nfs4_replay *rp = &open->op_stateowner->so_replay; + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); fh_copy_shallow(&cstate->current_fh.fh_handle, &rp->rp_openfh); @@ -374,7 +374,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; /* * The CURRENT_FH is already set to the file being * opened. 
(1) set open->op_cinfo, (2) set @@ -387,7 +387,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -405,8 +405,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: - if (open->op_stateowner) - cstate->replay_owner = open->op_stateowner; + if (open->op_openowner) + cstate->replay_owner = &open->op_openowner->oo_owner; else nfs4_unlock_state(); return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eb11626babc6..567130dccda0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,7 +63,7 @@ static u64 current_sessionid = 1; static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); +static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -77,7 +77,8 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(recall_lock); -static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *openowner_slab = NULL; +static struct kmem_cache *lockowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; static struct kmem_cache *deleg_slab = NULL; @@ -432,41 +433,39 @@ static void release_lock_stateid(struct nfs4_stateid *stp) unhash_generic_stateid(stp); file = find_any_file(stp->st_file); if (file) - locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); free_generic_stateid(stp); } -static void unhash_lockowner(struct nfs4_stateowner *sop) +static void unhash_lockowner(struct nfs4_lockowner *lo) { struct nfs4_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, + list_del(&lo->lo_owner.so_idhash); + list_del(&lo->lo_owner.so_strhash); + list_del(&lo->lo_perstateid); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_stateid, st_perstateowner); release_lock_stateid(stp); } } -static void release_lockowner(struct nfs4_stateowner *sop) +static void release_lockowner(struct nfs4_lockowner *lo) { - unhash_lockowner(sop); - nfs4_free_stateowner(sop); + unhash_lockowner(lo); + nfs4_free_lockowner(lo); } static void release_stateid_lockowners(struct nfs4_stateid *open_stp) { - struct nfs4_stateowner *lock_sop; + struct nfs4_lockowner *lo; while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_lockowner(lock_sop); + lo = list_entry(open_stp->st_lockowners.next, + struct nfs4_lockowner, lo_perstateid); + release_lockowner(lo); } } @@ -477,26 +476,25 @@ static void release_open_stateid(struct nfs4_stateid *stp) free_generic_stateid(stp); } -static void unhash_openowner(struct nfs4_stateowner *sop) +static void 
unhash_openowner(struct nfs4_openowner *oo) { struct nfs4_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); /* XXX: necessary? */ - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, + list_del(&oo->oo_owner.so_idhash); + list_del(&oo->oo_owner.so_strhash); + list_del(&oo->oo_perclient); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, struct nfs4_stateid, st_perstateowner); release_open_stateid(stp); } } -static void release_openowner(struct nfs4_stateowner *sop) +static void release_openowner(struct nfs4_openowner *oo) { - unhash_openowner(sop); - list_del(&sop->so_close_lru); - nfs4_free_stateowner(sop); + unhash_openowner(oo); + list_del(&oo->oo_close_lru); + nfs4_free_openowner(oo); } #define SESSION_HASH_SIZE 512 @@ -961,7 +959,7 @@ unhash_client_locked(struct nfs4_client *clp) static void expire_client(struct nfs4_client *clp) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -979,8 +977,8 @@ expire_client(struct nfs4_client *clp) unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { - sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_openowner(sop); + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + release_openowner(oo); } nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -2173,7 +2171,8 @@ nfsd4_free_slab(struct kmem_cache **slab) void nfsd4_free_slabs(void) { - nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&openowner_slab); + nfsd4_free_slab(&lockowner_slab); nfsd4_free_slab(&file_slab); nfsd4_free_slab(&stateid_slab); nfsd4_free_slab(&deleg_slab); @@ -2182,9 +2181,13 @@ nfsd4_free_slabs(void) static int nfsd4_init_slabs(void) { - stateowner_slab = kmem_cache_create("nfsd4_stateowners", - sizeof(struct nfs4_stateowner), 0, 0, NULL); - if (stateowner_slab == NULL) + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out_nomem; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (lockowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); @@ -2205,11 +2208,16 @@ out_nomem: return -ENOMEM; } -void -nfs4_free_stateowner(struct nfs4_stateowner *sop) +void nfs4_free_openowner(struct nfs4_openowner *oo) +{ + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); +} + +void nfs4_free_lockowner(struct nfs4_lockowner *lo) { - kfree(sop->so_owner.data); - kmem_cache_free(stateowner_slab, sop); + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -2219,74 +2227,72 @@ static void init_nfs4_replay(struct nfs4_replay *rp) rp->rp_buf = rp->rp_ibuf; } -static inline struct nfs4_stateowner *alloc_stateowner(struct xdr_netobj *owner, struct nfs4_client *clp) +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) { struct nfs4_stateowner *sop; - sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL); + sop = kmem_cache_alloc(slab, GFP_KERNEL); if (!sop) return NULL; sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); if (!sop->so_owner.data) { - kmem_cache_free(stateowner_slab, 
sop); + kmem_cache_free(slab, sop); return NULL; } sop->so_owner.len = owner->len; - INIT_LIST_HEAD(&sop->so_perclient); INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); sop->so_id = current_ownerid++; - sop->so_time = 0; sop->so_client = clp; init_nfs4_replay(&sop->so_replay); return sop; } -static void hash_openowner(struct nfs4_stateowner *sop, struct nfs4_client *clp, unsigned int strhashval) +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { unsigned int idhashval; - idhashval = open_ownerid_hashval(sop->so_id); - list_add(&sop->so_idhash, &open_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &open_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perclient, &clp->cl_openowners); + idhashval = open_ownerid_hashval(oo->oo_owner.so_id); + list_add(&oo->oo_owner.so_idhash, &open_ownerid_hashtbl[idhashval]); + list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); } -static struct nfs4_stateowner * +static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; - sop = alloc_stateowner(&open->op_owner, clp); - if (!sop) + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) return NULL; - sop->so_is_open_owner = 1; - sop->so_seqid = open->op_seqid; - sop->so_confirmed = 0; - hash_openowner(sop, clp, strhashval); - return sop; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_confirmed = 0; + oo->oo_time = 0; + INIT_LIST_HEAD(&oo->oo_close_lru); + hash_openowner(oo, clp, strhashval); + return oo; } static inline void init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop = open->op_stateowner; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + struct nfs4_openowner *oo = open->op_openowner; + unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_hash); INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); INIT_LIST_HEAD(&stp->st_perfile); list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); stp->st_type = NFS4_OPEN_STID; - stp->st_stateowner = sop; + stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_stateownerid = oo->oo_owner.so_id; stp->st_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; @@ -2299,12 +2305,12 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * } static void -move_to_close_lru(struct nfs4_stateowner *sop) +move_to_close_lru(struct nfs4_openowner *oo) { - dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); - list_move_tail(&sop->so_close_lru, &close_lru); - sop->so_time = get_seconds(); + list_move_tail(&oo->oo_close_lru, &close_lru); + oo->oo_time = get_seconds(); } static int @@ -2316,14 +2322,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, 
(sop->so_client->cl_clientid.cl_id == clid->cl_id); } -static struct nfs4_stateowner * +static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { struct nfs4_stateowner *so = NULL; list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return so; + return container_of(so, struct nfs4_openowner, oo_owner); } return NULL; } @@ -2474,7 +2480,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; - struct nfs4_stateowner *sop = NULL; + struct nfs4_openowner *oo = NULL; __be32 status; if (!check_name(open->op_owner)) @@ -2484,34 +2490,34 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_stale_clientid; strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); - sop = find_openstateowner_str(strhashval, open); - open->op_stateowner = sop; - if (!sop) { + oo = find_openstateowner_str(strhashval, open); + open->op_openowner = oo; + if (!oo) { /* Make sure the client's lease hasn't expired. */ clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; goto renew; } - if (!sop->so_confirmed) { + if (!oo->oo_confirmed) { /* Replace unconfirmed owners without checking for replay. */ - clp = sop->so_client; - release_openowner(sop); - open->op_stateowner = NULL; + clp = oo->oo_owner.so_client; + release_openowner(oo); + open->op_openowner = NULL; goto renew; } - status = nfsd4_check_seqid(cstate, sop, open->op_seqid); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; renew: - if (open->op_stateowner == NULL) { - sop = alloc_init_open_stateowner(strhashval, clp, open); - if (sop == NULL) + if (open->op_openowner == NULL) { + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) return nfserr_jukebox; - open->op_stateowner = sop; + open->op_openowner = oo; } - list_del_init(&sop->so_close_lru); - renew_client(sop->so_client); + list_del_init(&oo->oo_close_lru); + renew_client(oo->oo_owner.so_client); return nfs_ok; } @@ -2565,7 +2571,7 @@ out: return nfs_ok; if (status) return status; - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; return nfs_ok; } @@ -2573,14 +2579,14 @@ static __be32 nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) { struct nfs4_stateid *local; - struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; /* remember if we have seen this open owner */ - if (local->st_stateowner == sop) + if (local->st_stateowner == &oo->oo_owner) *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) @@ -2698,8 +2704,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_stateowner->so_confirmed = 1; - open->op_stateowner->so_client->cl_firststate = 1; + open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_owner.so_client->cl_firststate = 1; } /* Should we give out recallable state?: */ @@ -2782,11 +2788,11 @@ static void nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) { struct nfs4_delegation *dp; - struct 
nfs4_stateowner *sop = stp->st_stateowner; + struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; int status, flag = 0; - cb_up = nfsd4_cb_channel_good(sop->so_client); + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2802,7 +2808,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !sop->so_confirmed) + if (!cb_up || !oo->oo_confirmed) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2813,7 +2819,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta goto out; } - dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; status = nfs4_set_delegation(dp, flag); @@ -2901,7 +2907,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_confirmed = 1; /* * Attempt to hand out a delegation. No error return, because the @@ -2922,7 +2928,7 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed && + if (!open->op_openowner->oo_confirmed && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; @@ -2981,7 +2987,7 @@ static time_t nfs4_laundromat(void) { struct nfs4_client *clp; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nfsd4_lease; @@ -3038,16 +3044,16 @@ nfs4_laundromat(void) } test_val = nfsd4_lease; list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { - u = sop->so_time - cutoff; + oo = container_of(pos, struct nfs4_openowner, oo_close_lru); + if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + u = oo->oo_time - cutoff; if (test_val > u) test_val = u; break; } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - sop->so_id); - release_openowner(sop); + oo->oo_owner.so_id); + release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3069,13 +3075,12 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_stateowner * -search_close_lru(u32 st_id) +static struct nfs4_openowner * search_close_lru(u32 st_id) { - struct nfs4_stateowner *local; + struct nfs4_openowner *local; - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) + list_for_each_entry(local, &close_lru, oo_close_lru) { + if (local->oo_owner.so_id == st_id) return local; } return NULL; @@ -3209,7 +3214,8 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_bad_stateid; - if (!stp->st_stateowner->so_confirmed) + if (stp->st_stateowner->so_is_open_owner + && !openowner(stp->st_stateowner)->oo_confirmed) goto out; status = check_stateid_generation(stateid, &stp->st_stateid, has_session); @@ -3274,7 
+3280,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfserr_bad_stateid; if (nfs4_check_fh(current_fh, stp)) goto out; - if (!stp->st_stateowner->so_confirmed) + if (stp->st_stateowner->so_is_open_owner + && !openowner(stp->st_stateowner)->oo_confirmed) goto out; status = check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); @@ -3308,7 +3315,7 @@ nfsd4_free_delegation_stateid(stateid_t *stateid) static __be32 nfsd4_free_lock_stateid(struct nfs4_stateid *stp) { - if (check_for_locks(stp->st_file, stp->st_stateowner)) + if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) return nfserr_locks_held; release_lock_stateid(stp); return nfs_ok; @@ -3417,7 +3424,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (status) return status; - if (!sop->so_confirmed && !(flags & CONFIRM)) { + if (sop->so_is_open_owner && !openowner(sop)->oo_confirmed + && !(flags & CONFIRM)) { dprintk("NFSD: preprocess_seqid_op: stateowner not" " confirmed yet!\n"); return nfserr_bad_stateid; @@ -3434,7 +3442,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open_confirm *oc) { __be32 status; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", @@ -3452,17 +3460,17 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, CONFIRM | OPEN_STATE, &stp); if (status) goto out; - sop = stp->st_stateowner; + oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; - if (sop->so_confirmed) + if (oo->oo_confirmed) goto out; - sop->so_confirmed = 1; + oo->oo_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); - nfsd4_create_clid_dir(sop->so_client); + nfsd4_create_clid_dir(oo->oo_owner.so_client); status = nfs_ok; out: if (!cstate->replay_owner) @@ -3513,7 +3521,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, &od->od_stateid, OPEN_STATE, &stp); if (status) goto out; - status = nfserr_inval; if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", @@ -3546,8 +3553,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; + struct nfs4_openowner *oo; struct nfs4_stateid *stp; - struct nfs4_stateowner *so; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3563,19 +3570,19 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Also, we should make sure this isn't just the result of * a replayed close: */ - so = search_close_lru(close->cl_stateid.si_stateownerid); + oo = search_close_lru(close->cl_stateid.si_stateownerid); /* It's not stale; let's assume it's expired: */ - if (so == NULL) + if (oo == NULL) goto out; - cstate->replay_owner = so; - status = nfsd4_check_seqid(cstate, so, close->cl_seqid); + cstate->replay_owner = &oo->oo_owner; + status = nfsd4_check_seqid(cstate, &oo->oo_owner, close->cl_seqid); if (status) goto out; status = nfserr_bad_seqid; } if (status) goto out; - so = stp->st_stateowner; + oo = openowner(stp->st_stateowner); status = nfs_ok; update_stateid(&stp->st_stateid); memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); @@ -3587,8 +3594,8 @@ 
nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&so->so_stateids)) - move_to_close_lru(so); + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); out: if (!cstate->replay_owner) nfs4_unlock_state(); @@ -3768,17 +3775,17 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - sop = (struct nfs4_stateowner *) fl->fl_owner; - deny->ld_owner.data = kmemdup(sop->so_owner.data, - sop->so_owner.len, GFP_KERNEL); + lo = (struct nfs4_lockowner *) fl->fl_owner; + deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, + lo->lo_owner.so_owner.len, GFP_KERNEL); if (!deny->ld_owner.data) /* We just don't care that much */ goto nevermind; - deny->ld_owner.len = sop->so_owner.len; - deny->ld_clientid = sop->so_client->cl_clientid; + deny->ld_owner.len = lo->lo_owner.so_owner.len; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { nevermind: deny->ld_owner.len = 0; @@ -3795,8 +3802,8 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } -static struct nfs4_stateowner * -find_lockstateowner_str(struct inode *inode, clientid_t *clid, +static struct nfs4_lockowner * +find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); @@ -3804,19 +3811,19 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(op, owner, clid)) - return op; + return lockowner(op); } return NULL; } -static void hash_lockowner(struct nfs4_stateowner *sop, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) { unsigned int idhashval; - idhashval = lockownerid_hashval(sop->so_id); - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); + idhashval = lockownerid_hashval(lo->lo_owner.so_id); + list_add(&lo->lo_owner.so_idhash, &lock_ownerid_hashtbl[idhashval]); + list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } /* @@ -3827,28 +3834,27 @@ static void hash_lockowner(struct nfs4_stateowner *sop, unsigned int strhashval, * strhashval = lock_ownerstr_hashval */ -static struct nfs4_stateowner * +static struct nfs4_lockowner * alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; - sop = alloc_stateowner(&lock->lk_new_owner, clp); - if (!sop) + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) return NULL; - INIT_LIST_HEAD(&sop->so_stateids); - sop->so_is_open_owner = 0; + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ - sop->so_seqid = lock->lk_new_lock_seqid + 1; - 
sop->so_confirmed = 1; - hash_lockowner(sop, strhashval, clp, open_stp); - return sop; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; + hash_lockowner(lo, strhashval, clp, open_stp); + return lo; } static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_stateid *open_stp) { struct nfs4_stateid *stp; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + unsigned int hashval = stateid_hashval(lo->lo_owner.so_id, fp->fi_id); stp = nfs4_alloc_stateid(); if (stp == NULL) @@ -3859,13 +3865,13 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &sop->so_stateids); - stp->st_stateowner = sop; + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + stp->st_stateowner = &lo->lo_owner; stp->st_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_stateownerid = lo->lo_owner.so_id; stp->st_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stateid.si_generation = 0; @@ -3902,8 +3908,8 @@ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) { - struct nfs4_stateowner *open_sop = NULL; - struct nfs4_stateowner *lock_sop = NULL; + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; struct nfs4_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; @@ -3949,23 +3955,23 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, OPEN_STATE, &open_stp); if (status) goto out; + open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; - open_sop = open_stp->st_stateowner; if (!nfsd4_has_session(cstate) && - !same_clid(&open_sop->so_client->cl_clientid, + !same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->v.new.clientid)) goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->so_client->cl_clientid.cl_id, + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->oo_owner.so_client->cl_clientid.cl_id, &lock->v.new.owner); /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? 
*/ status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->so_client, open_stp, lock); + open_sop->oo_owner.so_client, open_stp, lock); if (lock_sop == NULL) goto out; lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); @@ -3974,12 +3980,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { /* lock (lock owner + lock stateid) already exists */ status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, LOCK_STATE, &lock_stp); if (status) goto out; - lock_sop = lock_stp->st_stateowner; + lock_sop = lockowner(lock_stp->st_stateowner); fp = lock_stp->st_file; } /* lock_sop and lock_stp have been created or found */ @@ -4092,7 +4098,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; - struct nfs4_stateowner *so; + struct nfs4_lockowner *lo; int error; __be32 status; @@ -4128,10 +4134,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - so = find_lockstateowner_str(inode, - &lockt->lt_clientid, &lockt->lt_owner); - if (so) - file_lock.fl_owner = (fl_owner_t)so; + lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); + if (lo) + file_lock.fl_owner = (fl_owner_t)lo; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; @@ -4186,7 +4191,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) stp->st_stateowner; + file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; @@ -4225,7 +4230,7 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; struct inode *inode = filp->fi_inode; @@ -4250,6 +4255,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; struct nfs4_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; @@ -4279,11 +4285,10 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_file, sop)) + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) goto out; - /* Note: so_perclient unused for lockowners, - * so it's OK to fool with here. */ - list_add(&sop->so_perclient, &matches); + list_add(&lo->lo_list, &matches); } } } @@ -4292,12 +4297,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * have been checked. */ status = nfs_ok; while (!list_empty(&matches)) { - sop = list_entry(matches.next, struct nfs4_stateowner, - so_perclient); + lo = list_entry(matches.next, struct nfs4_lockowner, + lo_list); /* unhash_stateowner deletes so_perclient only * for openowners. 
*/ - list_del(&sop->so_perclient); - release_lockowner(sop); + list_del(&lo->lo_list); + release_lockowner(lo); } out: nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c4dcba3aac1f..182570bed472 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -646,7 +646,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) memset(open->op_bmval, 0, sizeof(open->op_bmval)); open->op_iattr.ia_valid = 0; - open->op_stateowner = NULL; + open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(16 + sizeof(clientid_t)); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0d88000d15d7..7994ed9be3cc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -337,14 +337,11 @@ struct nfs4_replay { * reaped (when so_perfilestate is empty) to hold the last close replay. * reaped by laundramat thread after lease period. */ + struct nfs4_stateowner { struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ - struct list_head so_perclient; struct list_head so_stateids; - struct list_head so_perstateid; /* for lockowners only */ - struct list_head so_close_lru; /* tail queue */ - time_t so_time; /* time of placement on so_close_lru */ int so_is_open_owner; /* 1=openowner,0=lockowner */ u32 so_id; struct nfs4_client * so_client; @@ -352,10 +349,33 @@ struct nfs4_stateowner { * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ - int so_confirmed; /* successful OPEN_CONFIRM? */ struct nfs4_replay so_replay; }; +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + struct list_head oo_close_lru; /* tail queue */ + time_t oo_time; /* time of placement on so_close_lru */ + int oo_confirmed; /* successful OPEN_CONFIRM? */ +}; + +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_list; /* for temporary uses */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. 
* o fi_perfile list is used to search for conflicting @@ -457,7 +477,8 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct nfs4_stateowner *sop); +extern void nfs4_free_openowner(struct nfs4_openowner *); +extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 27a3dfab96a9..f95a72482064 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -227,7 +227,7 @@ struct nfsd4_open { struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ int op_truncate; /* used during processing */ - struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr -- cgit v1.2.3-58-ga151 From 4665e2bac5076d02264f4a4d79edafa05ec7b752 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 14:50:49 -0400 Subject: nfsd4: split out some free_generic_stateid code We'll use this elsewhere. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 567130dccda0..c28432a80210 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -411,7 +411,7 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp) list_del(&stp->st_perstateowner); } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_stateid *stp) { int i; @@ -420,9 +420,16 @@ static void free_generic_stateid(struct nfs4_stateid *stp) if (test_bit(i, &stp->st_access_bmap)) nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); } } put_nfs4_file(stp->st_file); + stp->st_file = NULL; +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + close_generic_stateid(stp); kmem_cache_free(stateid_slab, stp); } -- cgit v1.2.3-58-ga151 From a25cac5198d4ff2842ccca63b423962848ad24b2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:40:25 +0200 Subject: proc: Consider NO_HZ when printing idle and iowait times The show_stat handler of the /proc/stat file relies on kstat_cpu(cpu) statistics when printing information about idle and iowait times. This is OK if we are not using a tickless kernel (CONFIG_NO_HZ) because the counters are updated periodically. With NO_HZ things get more tricky because we are not doing idle/iowait accounting while we are tickless, so the values might get outdated. Users of /proc/stat will notice this as unchanged idle/iowait values, which are then interpreted as 0% idle/iowait time. From the user-space POV this is unexpected behavior and a change of the interface. Let's fix this by using get_cpu_{idle,iowait}_time_us, which accounts the total idle/iowait time since boot and doesn't rely on sampling or any other periodic activity. Fall back to the previous behavior if NO_HZ is disabled or not configured.
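For reference, idle and iowait are the fourth and fifth fields of the cpu lines that this commit fixes. A minimal userspace reader of the aggregate line (a sketch, with most error handling elided) shows what monitoring tools actually sample:

#include <stdio.h>

int main(void)
{
        unsigned long long user, nice, system, idle, iowait;
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
                return 1;
        /* The first line aggregates all CPUs:
         * "cpu user nice system idle iowait irq softirq ..." (USER_HZ ticks).
         * Before this fix, on a tickless kernel, idle/iowait could stay
         * unchanged between samples and be misread as 0% idle time. */
        if (fscanf(f, "cpu %llu %llu %llu %llu %llu",
                   &user, &nice, &system, &idle, &iowait) == 5)
                printf("idle=%llu iowait=%llu\n", idle, iowait);
        fclose(f);
        return 0;
}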
Signed-off-by: Michal Hocko Cc: Dave Jones Cc: Arnd Bergmann Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/39181366adac1b39cb6aa3cd53ff0f7c78d32676.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner --- fs/proc/stat.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9758b654a1bc..42b274da92c3 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -21,6 +22,35 @@ #define arch_idle_time(cpu) 0 #endif +static cputime64_t get_idle_time(int cpu) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); + cputime64_t idle; + + if (idle_time == -1ULL) { + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); + } else + idle = usecs_to_cputime(idle_time); + + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); + cputime64_t iowait; + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kstat_cpu(cpu).cpustat.iowait; + else + iowait = usecs_to_cputime(iowait_time); + + return iowait; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v) user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + idle = cputime64_add(idle, get_idle_time(i)); + iowait = cputime64_add(iowait, get_iowait_time(i)); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); @@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest), (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kstat_cpu(i).cpustat.user; nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; + idle = get_idle_time(i); + iowait = get_iowait_time(i); irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; -- cgit v1.2.3-58-ga151 From 7c2e70879fc0949b4220ee61b7c4553f6976a94d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:28:51 -0400 Subject: ext4: add ext4-specific kludge to avoid an oops after the disk disappears The del_gendisk() function uninitializes the disk-specific data structures, including the bdi structure, without telling anyone else. Once this happens, any attempt to call mark_buffer_dirty() (for example, by ext4_commit_super), will cause a kernel OOPS. Fix this for now until we can fix things in an architecturally correct way. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee2f74a7084d..5dcd0dacc591 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -414,6 +414,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + + return bdi->dev == NULL; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -4072,7 +4088,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* -- cgit v1.2.3-58-ga151 From 281b59959707dfae03ce038cdf231bf4904e170c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:34:51 -0400 Subject: ext4: read-only support for bigalloc file systems This adds supports for bigalloc file systems. It teaches the mount code just enough about bigalloc superblock fields that it will mount the file system without freaking out that the number of blocks per group is too big. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 17 ++++++++++++++--- fs/ext4/super.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 64 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 48ae98819d35..f7257aa6bf81 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -239,8 +239,11 @@ struct ext4_io_submit { # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif @@ -306,6 +309,7 @@ struct flex_groups { #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) @@ -975,9 +979,9 @@ struct ext4_super_block { /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ - __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ @@ -1073,7 +1077,10 @@ struct ext4_super_block { __u8 s_last_error_func[32]; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; - __le32 s_reserved[112]; /* Padding to the end of the block */ + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_reserved[109]; /* Padding to the end of the block */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1093,6 +1100,7 @@ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ @@ -1101,6 +1109,8 @@ struct ext4_sb_info { ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct 
ext4_super_block *s_es; /* Pointer to the super block in the buffer */ @@ -1367,6 +1377,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5dcd0dacc591..823e7d9deee2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1953,7 +1953,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, res = MS_RDONLY; } if (read_only) - return res; + goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); @@ -1984,6 +1984,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); +done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", @@ -3105,10 +3106,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *cp; const char *descr; int ret = -ENOMEM; - int blocksize; + int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files; + int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -3412,12 +3413,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_dirt = 1; } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / blocksize; + if (sbi->s_inodes_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, "#inodes per group too big: %lu", -- cgit v1.2.3-58-ga151 From bab08ab9646288f1b0b72a7aaeecdff94bd62c18 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: 
Fri, 9 Sep 2011 18:36:51 -0400 Subject: ext4: enforce bigalloc restrictions (e.g., no online resizing, etc.) At least initially, if the bigalloc feature is enabled, we will not support non-extent mapped inodes, online resizing, online defrag, or the FITRIM ioctl. This simplifies the initial implementation. Signed-off-by: "Theodore Ts'o" --- fs/ext4/indirect.c | 7 +++++++ fs/ext4/inode.c | 5 +++++ fs/ext4/ioctl.c | 33 +++++++++++++++++++++++++++++---- fs/ext4/super.c | 7 +++++++ 4 files changed, 48 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index ea704dff8669..3cfc73fbca8e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6ecc93979e48..904a9a623dab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3358,6 +3358,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return -ENOTSUPP; } + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + /* TODO: Add support for bigalloc file systems */ + return -ENOTSUPP; + } + return ext4_ext_punch_hole(file, offset, length); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f18bfe37aff8..2046d699b4a7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -21,6 +21,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; @@ -183,7 +184,6 @@ setversion_out: * Returns 1 if it slept, else zero.
*/ { - struct super_block *sb = inode->i_sb; DECLARE_WAITQUEUE(wait, current); int ret = 0; @@ -199,7 +199,6 @@ setversion_out: #endif case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err, err2=0; err = ext4_resize_begin(sb); @@ -209,6 +208,13 @@ setversion_out: if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -250,6 +256,13 @@ setversion_out: goto mext_out; } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) goto mext_out; @@ -270,7 +283,6 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - struct super_block *sb = inode->i_sb; int err, err2=0; err = ext4_resize_begin(sb); @@ -281,6 +293,13 @@ mext_out: sizeof(input))) return -EFAULT; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -337,7 +356,6 @@ mext_out: case FITRIM: { - struct super_block *sb = inode->i_sb; struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; @@ -348,6 +366,13 @@ mext_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "FITRIM not supported with bigalloc"); + return -EOPNOTSUPP; + } + if (copy_from_user(&range, (struct fstrim_range *)arg, sizeof(range))) return -EFAULT; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 823e7d9deee2..25a4bfe3f39f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2701,6 +2701,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } return 1; } -- cgit v1.2.3-58-ga151 From 7137d7a48e2213eb1f6d6529da14c2ed3706b795 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:38:51 -0400 Subject: ext4: convert instances of EXT4_BLOCKS_PER_GROUP to EXT4_CLUSTERS_PER_GROUP Change the places in fs/ext4/mballoc.c where EXT4_BLOCKS_PER_GROUP is used to indicate the number of bits in a block bitmap (which is really a cluster allocation bitmap in bigalloc file systems). There are still some places in the ext4 codebase where usage of EXT4_BLOCKS_PER_GROUP needs to be audited/fixed, in code paths that aren't used given the initial restricted assumptions for bigalloc. These will need to be fixed before we can relax those restrictions.
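The unit change matters because under bigalloc each bit in the on-disk "block" bitmap covers a whole cluster, so bitmap indices must be cluster-relative. A standalone C sketch of the mapping, with cluster_bits and the block numbers as assumed example values:

#include <stdio.h>

int main(void)
{
        unsigned int cluster_bits = 4;                  /* example: 16 blocks per cluster */
        unsigned long long group_first_block = 32768;   /* example group start */
        unsigned long long block = group_first_block + 1000;

        /* The bitmap bit for this block is its cluster offset within the
         * group; the number of valid bits per bitmap is therefore
         * EXT4_CLUSTERS_PER_GROUP(sb), not EXT4_BLOCKS_PER_GROUP(sb). */
        unsigned long long bit = (block - group_first_block) >> cluster_bits;

        printf("block %llu -> bitmap bit %llu\n", block, bit);
        return 0;
}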
Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 17a5a57c415a..81e28657a3c2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -653,7 +653,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -705,7 +705,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; @@ -1624,8 +1624,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1823,8 +1823,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we @@ -1887,7 +1887,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { @@ -3036,7 +3036,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -3690,7 +3690,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); repeat: @@ -4007,8 +4007,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) + len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; /* start searching from the goal */ goal = ar->goal; @@ -4552,8 +4552,8 @@ do_more: * Check to see if we are freeing blocks across a group * boundary. 
*/ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (bit + count > EXT4_CLUSTERS_PER_GROUP(sb)) { + overflow = bit + count - EXT4_CLUSTERS_PER_GROUP(sb); count -= overflow; } bitmap_bh = ext4_read_block_bitmap(sb, block_group); @@ -4948,7 +4948,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) struct ext4_group_info *grp; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); - ext4_grpblk_t cnt = 0, first_block, last_block; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, len, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); @@ -4958,7 +4958,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) return -EINVAL; if (start + len <= first_data_blk) goto out; @@ -4969,11 +4969,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, - &first_group, &first_block); + &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), - &last_group, &last_block); + &last_group, &last_cluster); last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_block = EXT4_BLOCKS_PER_GROUP(sb); + last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); if (first_group > last_group) return -EINVAL; @@ -4993,20 +4993,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group in which case start + * len < EXT4_BLOCKS_PER_GROUP(sb). */ - if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) - last_block = first_block + len; - len -= last_block - first_block; + if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) + last_cluster = first_cluster + len; + len -= last_cluster - first_cluster; if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_block, - last_block, minlen); + cnt = ext4_trim_all_free(sb, group, first_cluster, + last_cluster, minlen); if (cnt < 0) { ret = cnt; break; } } trimmed += cnt; - first_block = 0; + first_cluster = 0; } range->len = trimmed * sb->s_blocksize; -- cgit v1.2.3-58-ga151 From 49f7f9af4bb4d7972f3a35a74877937fec9f622d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:40:51 -0400 Subject: ext4: factor out block group accounting into functions This makes it easier to understand how ext4_init_block_bitmap() works, and it will assist when we split out ext4_free_blocks_after_init() in the next commit. 
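As a sketch of what the refactoring preserves (using the helper names introduced in the diff below), the free-block count computed for a group can be written as:

	free_blocks = num_blocks_in_group(sb, block_group)
		      - num_base_meta_blocks(sb, block_group)
		      - ext4_group_used_meta_blocks(sb, block_group, gdp);

Naming the three terms separately is what allows each of them to be converted to cluster units independently later in the series.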
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 80 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f8224adf496e..8573e2bfb78a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,9 @@ #include +static unsigned int num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -83,14 +86,30 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, return used_blocks; } +static unsigned int num_blocks_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + if (block_group == ext4_get_groups_count(sb) - 1) { + /* + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. + */ + return ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + return EXT4_BLOCKS_PER_GROUP(sb); +} + /* Initializes an uninitialized block bitmap if given, and returns the * number of blocks free in the group. */ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { - int bit, bit_max; + unsigned int bit, bit_max = num_base_meta_blocks(sb, block_group); ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned free_blocks, group_blocks; + unsigned group_blocks = num_blocks_in_group(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (bh) { @@ -110,35 +129,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, memset(bh->b_data, 0, sb->s_blocksize); } - /* Check for superblock and gdt backups in this group */ - bit_max = ext4_bg_has_super(sb, block_group); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (bit_max) { - bit_max += ext4_bg_num_gdb(sb, block_group); - bit_max += - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); - } - } else { /* For META_BG_BLOCK_GROUPS */ - bit_max += ext4_bg_num_gdb(sb, block_group); - } - - if (block_group == ngroups - 1) { - /* - * Even though mke2fs always initialize first and last group - * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need - * to make sure we calculate the right free blocks - */ - group_blocks = ext4_blocks_count(sbi->s_es) - - ext4_group_first_block_no(sb, ngroups - 1); - } else { - group_blocks = EXT4_BLOCKS_PER_GROUP(sb); - } - - free_blocks = group_blocks - bit_max; - if (bh) { ext4_fsblk_t start, tmp; int flex_bg = 0; @@ -176,7 +166,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); + return group_blocks - bit_max - + ext4_group_used_meta_blocks(sb, block_group, gdp); } @@ -620,6 +611,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. 
+ */ +static unsigned int num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb(sb, block_group); + } + return num; +} /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation -- cgit v1.2.3-58-ga151 From fd034a84e1ea5c8c8d159cd2089c32e792c269b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:42:51 -0400 Subject: ext4: split out ext4_free_blocks_after_init() The function ext4_free_blocks_after_init() used to be a #define of ext4_init_block_bitmap(). This actually made it difficult to understand how the function worked, and made it hard to make changes to support clusters. So as an initial cleanup, I've separated out the functionality of initializing the block bitmap from calculating the number of free blocks in the new block group. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 105 +++++++++++++++++++++++++++---------------------------- fs/ext4/ext4.h | 13 +++---- fs/ext4/ialloc.c | 20 ++++------- 3 files changed, 66 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8573e2bfb78a..735d9fcc72e6 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -102,74 +102,73 @@ static unsigned int num_blocks_in_group(struct super_block *sb, return EXT4_BLOCKS_PER_GROUP(sb); } -/* Initializes an uninitialized block bitmap if given, and returns the - * number of blocks free in the group. */ -unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, struct ext4_group_desc *gdp) +/* Initializes an uninitialized block bitmap */ +void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned int bit, bit_max = num_base_meta_blocks(sb, block_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned group_blocks = num_blocks_in_group(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (bh) { - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", - block_group); - ext4_free_blks_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; - } - memset(bh->b_data, 0, sb->s_blocksize); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag.
*/ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return; } + memset(bh->b_data, 0, sb->s_blocksize); - if (bh) { - ext4_fsblk_t start, tmp; - int flex_bg = 0; + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); - for (bit = 0; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + start = ext4_group_first_block_no(sb, block_group); - start = ext4_group_first_block_no(sb, block_group); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG)) - flex_bg = 1; + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); - /* Set bits for block and inode bitmaps, and inode table */ - tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); - tmp = ext4_inode_bitmap(sb, gdp); + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(tmp - start, bh->b_data); - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!flex_bg || - ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); - } - /* - * Also if the number of blocks within the group is - * less than the blocksize * 8 ( which is the size - * of bitmap ), set rest of the block bitmap to 1 - */ - ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, - bh->b_data); } - return group_blocks - bit_max - - ext4_group_used_meta_blocks(sb, block_group, gdp); + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_blocks_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); } +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_blocks_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_blocks_in_group(sb, block_group) - + num_base_meta_blocks(sb, block_group) - + ext4_group_used_meta_blocks(sb, block_group, gdp); +} /* * The free blocks are managed by bitmaps. 
A file system contains several diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f7257aa6bf81..b0b7b67e439d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1763,12 +1763,13 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) +extern void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern unsigned ext4_free_blocks_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9c63f273b550..b7a8130d0af4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,7 +816,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - int free = 0; static int once = 1; ext4_group_t flex_group; @@ -950,26 +949,21 @@ got: goto fail; } - free = 0; - ext4_lock_group(sb, group); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, free); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } ext4_unlock_group(sb, group); - /* Don't need to dirty bitmap block if we didn't change it */ - if (free) { - BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); - err = ext4_handle_dirty_metadata(handle, - NULL, block_bitmap_bh); - } - - brelse(block_bitmap_bh); if (err) goto fail; } -- cgit v1.2.3-58-ga151 From d5b8f31007a93777cfb0603b665858fb7aebebfc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:44:51 -0400 Subject: ext4: bigalloc changes to block bitmap initialization functions Add bigalloc support to ext4_init_block_bitmap() and ext4_free_blocks_after_init(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 131 ++++++++++++++++++++++++++++++++++++++----------------- fs/ext4/ext4.h | 13 ++++++ 2 files changed, 103 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 735d9fcc72e6..1c6d777b35a2 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,9 +23,6 @@ #include -static unsigned int num_base_meta_blocks(struct super_block *sb, - ext4_group_t block_group); - /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -58,37 +55,87 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, return 0; } -static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. 
+ */ +unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { - ext4_fsblk_t tmp; + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* block bitmap, inode bitmap, and inode table blocks */ - int used_blocks = sbi->s_itb_per_group + 2; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), - block_group)) - used_blocks--; - - if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), - block_group)) - used_blocks--; - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!ext4_block_in_group(sb, tmp, block_group)) - used_blocks -= 1; + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, (start - + ext4_block_bitmap(sb, gdp))); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; + } + } + + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + start - ext4_inode_bitmap(sb, gdp)); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; + } + } + + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, start - itbl_blk + i); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; } } - return used_blocks; + + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; } -static unsigned int num_blocks_in_group(struct super_block *sb, - ext4_group_t block_group) +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) { + unsigned int blocks; + if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and @@ -96,10 +143,11 @@ static unsigned int num_blocks_in_group(struct super_block *sb, * we need to make sure we calculate the right free * blocks. 
*/ - return ext4_blocks_count(EXT4_SB(sb)->s_es) - + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else - return EXT4_BLOCKS_PER_GROUP(sb); + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ @@ -107,7 +155,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned int bit, bit_max = num_base_meta_blocks(sb, block_group); + unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; @@ -126,6 +174,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, } memset(bh->b_data, 0, sb->s_blocksize); + bit_max = ext4_num_base_meta_clusters(sb, block_group); for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); @@ -137,24 +186,25 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } + /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ - ext4_mark_bitmap_end(num_blocks_in_group(sb, block_group), + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); } @@ -165,9 +215,8 @@ unsigned ext4_free_blocks_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - return num_blocks_in_group(sb, block_group) - - num_base_meta_blocks(sb, block_group) - - ext4_group_used_meta_blocks(sb, block_group, gdp); + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); } /* @@ -611,14 +660,14 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata blocks at + * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. 
*/ -static unsigned int num_base_meta_blocks(struct super_block *sb, - ext4_group_t block_group) +unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int num; + unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); @@ -633,7 +682,7 @@ static unsigned int num_base_meta_blocks(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb(sb, block_group); } - return num; + return EXT4_NUM_B2C(sbi, num); } /** * ext4_inode_to_goal_block - return a hint for block allocation diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0b7b67e439d..803cfa42e1e8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -261,6 +261,14 @@ struct ext4_io_submit { #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) + /* * Structure of a blocks group descriptor */ @@ -1770,6 +1778,11 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_blocks_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); +extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ -- cgit v1.2.3-58-ga151 From 3212a80a58062056bb922811071062be58d8fee1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:46:51 -0400 Subject: ext4: convert block group-relative offsets to use clusters Certain parts of the ext4 code base, primarily in mballoc.c, use a block group number and offset from the beginning of the block group. This offset is invariably used to index into the allocation bitmap, so change the offset to be denominated in units of clusters. 
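A worked example of the new offset arithmetic (illustrative values, assuming s_first_data_block == 0): with 32768 blocks per group and a cluster ratio of 16 (s_cluster_bits == 4), block 98310 decomposes as 98310 = 3 * 32768 + 6, so do_div() yields group 3 with a raw block offset of 6, and after this patch *offsetp becomes 6 >> 4 = 0, i.e. bit 0 of group 3's cluster bitmap. The ext4_grp_offs_to_block() change is the inverse mapping: fe_start is shifted left by s_cluster_bits before being added to the group's first block.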
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 6 ++++-- fs/ext4/mballoc.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1c6d777b35a2..89abf1f7b253 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -28,7 +28,8 @@ */ /* - * Calculate the block group number and offset, given a block number + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) @@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9d4a636b546c..3cdb8aa9f6b7 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -216,6 +216,7 @@ struct ext4_buddy { static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } #endif -- cgit v1.2.3-58-ga151 From 53accfa9f819c80056db6f03f9c5cfa4bcba1ed8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:48:51 -0400 Subject: ext4: teach mballoc preallocation code about bigalloc clusters In most of mballoc.c, we do everything in units of clusters, since the block allocation bitmaps and buddy bitmaps are all denominated in clusters. The one place where we do deal with absolute block numbers is in the code that handles the preallocation regions, since in the case of inode-based preallocation regions, the start of the preallocation region can't be relative to the beginning of the group. So this adds a bit of complexity, where pa_pstart and pa_lstart are block numbers, while pa_free, pa_len, and fe_len are denominated in units of clusters. Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 95 +++++++++++++++++++++++++++++++------------------------ fs/ext4/mballoc.h | 4 +-- 2 files changed, 56 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 81e28657a3c2..8765f2512f13 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -70,8 +70,8 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> length for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc @@ -459,7 +459,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += first + i; + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, @@ -734,7 +734,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u blocks in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd", free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -1339,7 +1339,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += block; + blocknr += EXT4_C2B(EXT4_SB(sb), block); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -1831,7 +1831,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * we have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But bitmap says 0", free); break; @@ -1841,7 +1841,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -2752,7 +2752,7 @@ void ext4_exit_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_blks) + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; @@ -2791,7 +2791,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = ac->ac_b_ex.fe_len; + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); @@ -2838,7 +2838,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -2886,6 +2886,7 @@ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; @@ -2916,7 +2917,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -2988,7 +2989,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || @@ -3018,9 +3020,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); 
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -3043,7 +3047,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -3112,14 +3116,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3127,7 +3133,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; @@ -3193,6 +3199,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; @@ -3210,12 +3217,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3412,6 +3421,7 @@ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; @@ -3443,16 +3453,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3477,7 
+3489,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3592,7 +3604,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - bit; + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3607,7 +3619,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; @@ -3958,7 +3971,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -4019,18 +4032,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, /* set up allocation goals */ memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or @@ -4182,13 +4192,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); - pa->pa_pstart += ac->ac_b_ex.fe_len; - pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); @@ -4249,7 +4260,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; - unsigned int reserv_blks = 0; + unsigned int reserv_clstrs = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4279,12 +4290,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = -ENOSPC; return 0; } - reserv_blks = ar->len; + reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, ar->len); + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); } else { while (ar->len && - dquot_alloc_block(ar->inode, ar->len)) { + dquot_alloc_block(ar->inode, + 
EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; @@ -4328,7 +4341,7 @@ repeat: ext4_mb_new_preallocation(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp == -EAGAIN) { /* * drop the reference that we took @@ -4364,13 +4377,13 @@ out: if (ac) kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) - dquot_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if (!ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, - reserv_blks); + reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3cdb8aa9f6b7..1641f4b57439 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -139,9 +139,9 @@ enum { struct ext4_free_extent { ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; + ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; - ext4_grpblk_t fe_len; + ext4_grpblk_t fe_len; /* In cluster units */ }; /* -- cgit v1.2.3-58-ga151 From 84130193e0e6568dfdfb823f0e1e19aec80aff6e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:50:51 -0400 Subject: ext4: teach ext4_free_blocks() about bigalloc and clusters The ext4_free_blocks() function now has two new flags that indicate whether a partial cluster at the beginning or the end of the block extents should be freed or not. That will be up to the caller (i.e., truncate), who can figure out whether partial clusters at the beginning or the end of a block range can be freed. We also have to update the ext4_mb_free_metadata() and release_blocks_on_commit() machinery to be cluster-based, since it is used by ext4_free_blocks().
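To make the new boundary handling concrete, here is a worked example of the rounding logic added in the diff below (illustrative values, cluster ratio 16):

/*
 * Illustrative only: a request to free blocks 100..139 (count = 40)
 * neither starts nor ends on a cluster boundary (100 & 15 == 4, and
 * after widening on the left, 44 & 15 == 12).
 *
 * Default behaviour - widen to whole clusters:
 *	block = 96, count = 48		(clusters 6..8 are freed)
 *
 * With NOFREE_FIRST_CLUSTER and NOFREE_LAST_CLUSTER - shrink to
 * whole clusters instead:
 *	block = 112, count = 16		(only cluster 7 is freed; the
 *					 partial clusters at either end
 *					 are left to the caller)
 */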
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ fs/ext4/mballoc.c | 86 +++++++++++++++++++++++++++++++++++++++---------------- fs/ext4/mballoc.h | 2 +- 3 files changed, 65 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 803cfa42e1e8..030bfc1cb59d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -539,6 +539,8 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 /* * Flags used by ext4_discard_partial_page_buffers diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8765f2512f13..57ce6960e940 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2602,11 +2602,13 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t block, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; - discard_block = block + ext4_group_first_block_no(sb, block_group); + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); @@ -2633,7 +2635,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) if (test_opt(sb, DISCARD)) ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); + entry->start_cluster, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2646,7 +2648,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_lock_group(sb, entry->group); /* Take it out of per group rb tree */ rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); /* * Clear the trimmed flag for the group so that the next @@ -3300,7 +3302,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_cluster, entry->count); n = rb_next(n); } return; @@ -4401,7 +4403,7 @@ static int can_merge(struct ext4_free_data *entry1, { if ((entry1->t_tid == entry2->t_tid) && (entry1->group == entry2->group) && - ((entry1->start_blk + entry1->count) == entry2->start_blk)) + ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) return 1; return 0; } @@ -4411,7 +4413,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; - ext4_grpblk_t block; + ext4_grpblk_t cluster; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4424,7 +4426,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->node; - block = new_entry->start_blk; + cluster = new_entry->start_cluster; if (!*n) { /* first free block exent. 
We need to @@ -4438,13 +4440,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, node); - if (block < entry->start_blk) + if (cluster < entry->start_cluster) n = &(*n)->rb_left; - else if (block >= (entry->start_blk + entry->count)) + else if (cluster >= (entry->start_cluster + entry->count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, - ext4_group_first_block_no(sb, group) + block, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); return 0; } @@ -4458,7 +4461,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, if (node) { entry = rb_entry(node, struct ext4_free_data, node); if (can_merge(entry, new_entry)) { - new_entry->start_blk = entry->start_blk; + new_entry->start_cluster = entry->start_cluster; new_entry->count += entry->count; rb_erase(node, &(db->bb_free_root)); spin_lock(&sbi->s_md_lock); @@ -4509,6 +4512,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; @@ -4557,6 +4561,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = block & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = count & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; + } + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4565,10 +4601,12 @@ do_more: * Check to see if we are freeing blocks across a group * boundary. 
*/ - if (bit + count > EXT4_CLUSTERS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_CLUSTERS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } + count_clusters = EXT4_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4583,9 +4621,9 @@ do_more: if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); @@ -4610,11 +4648,11 @@ do_more: #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4631,13 +4669,13 @@ do_more: err = -ENOMEM; goto error_return; } - new_entry->start_blk = bit; + new_entry->start_cluster = bit; new_entry->group = block_group; - new_entry->count = count; + new_entry->count = count_clusters; new_entry->t_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap @@ -4645,11 +4683,11 @@ do_more: * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(inode, &e4b, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count; + ret = ext4_free_blks_count(sb, gdp) + count_clusters; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 1641f4b57439..dc99930d4cb5 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -106,7 +106,7 @@ struct ext4_free_data { ext4_group_t group; /* free block extent */ - ext4_grpblk_t start_blk; + ext4_grpblk_t start_cluster; ext4_grpblk_t count; /* transaction which freed this extent */ -- cgit v1.2.3-58-ga151 From 4d33b1ef10995d7ba6191d67456202c697a92a32 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:52:51 -0400 Subject: ext4: teach ext4_ext_map_blocks() about the bigalloc feature If we need to allocate a new block in ext4_ext_map_blocks(), the function needs to see if the cluster has already been allocated. 
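A concrete illustration of the "implied cluster" case (hypothetical numbers, cluster ratio 16): suppose an extent maps logical block 10 to physical block 1034, so both sit at offset 10 within their respective clusters, as the allocator arranges. A lookup for logical block 12 finds no extent, but 12 >> 4 equals 10 >> 4, so the cluster holding block 12 is already allocated, and get_implied_cluster_alloc() in the diff below can return the mapping directly:

	map->m_pblk = 1024 + (12 & 15);	/* 1024 = physical start of the cluster */

so the request is satisfied without calling ext4_mb_new_blocks() at all.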
Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 162 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ba7bd5a176ce..bd42ab29efec 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1270,7 +1270,8 @@ static int ext4_ext_search_left(struct inode *inode, */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1312,9 +1313,7 @@ static int ext4_ext_search_right(struct inode *inode, return -EIO; } } - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { @@ -1327,9 +1326,7 @@ static int ext4_ext_search_right(struct inode *inode, if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } /* go up and search for index to the right */ @@ -1372,9 +1369,12 @@ got_index: return -EIO; } ex = EXT_FIRST_EXTENT(eh); +found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); - put_bh(bh); + *ret_ex = ex; + if (bh) + put_bh(bh); return 0; } @@ -1627,7 +1627,8 @@ static int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -static unsigned int ext4_ext_check_overlap(struct inode *inode, +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { @@ -1641,6 +1642,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, if (!path[depth].p_ext) goto out; b2 = le32_to_cpu(path[depth].p_ext->ee_block); + b2 &= ~(sbi->s_cluster_ratio - 1); /* * get the next allocated block if the extent in the path @@ -1650,6 +1652,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; + b2 &= ~(sbi->s_cluster_ratio - 1); } /* check for wrap through zero on extent logical start block*/ @@ -3293,6 +3296,106 @@ out2: return err ? err : allocated; } +/* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sbi The ext4-specific superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. 
The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). + */ +static int get_implied_cluster_alloc(struct ext4_sb_info *sbi, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. 
+ * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + return 1; + } + return 0; +} + + /* * Block allocation/map/preallocation routine for extents based files * @@ -3315,14 +3418,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int err = 0, depth, ret; - unsigned int allocated = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_lblk_t cluster_offset; struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", @@ -3508,9 +3613,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } + /* * Okay, we need to do block allocation. */ + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_ext_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(sbi, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + goto got_allocated_blocks; + } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; @@ -3518,10 +3637,20 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out2; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) goto out2; + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(sbi, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + goto got_allocated_blocks; + } + /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is @@ -3536,9 +3665,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_len = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ - newex.ee_block = cpu_to_le32(map->m_lblk); newex.ee_len = cpu_to_le16(map->m_len); - err = ext4_ext_check_overlap(inode, &newex, path); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else @@ -3548,7 +3676,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; - ar.len = allocated; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. 
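+ * For example (illustrative numbers, not part of the change itself): with + * s_cluster_ratio = 16, a request at logical block 100 gives offset = + * 100 & 15 = 4, so ar.goal and ar.logical are both pulled back by four + * blocks to the cluster boundary at block 96.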
+ */ + offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else @@ -3561,9 +3700,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); + free_on_err = 1; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; +got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock); + ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); /* Mark uninitialized */ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ @@ -3591,7 +3735,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err) { + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ @@ -4115,7 +4259,6 @@ found_delayed_extent: return EXT_BREAK; return EXT_CONTINUE; } - /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) -- cgit v1.2.3-58-ga151 From 0aa060000e83ca3d09ddc446a7174fb0820d99bc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:54:51 -0400 Subject: ext4: teach ext4_ext_truncate() about the bigalloc feature When we are truncating (as opposed to unlinking) a file, we need to worry about partial truncates of a file, especially in the light of sparse files. The changes here make sure that arbitrary truncates of sparse files work correctly. Yeah, it's messy. Note that these functions will need to be revisited when the punch ioctl is integrated --- in fact this commit will probably have merge conflicts with the punch changes which Allison Henderson and the IBM LTC have been working on. I will need to fix this up when either patch hits mainline. Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd42ab29efec..cd4479c08031 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2202,14 +2202,39 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, - struct ext4_extent *ex, - ext4_lblk_t from, ext4_lblk_t to) + struct ext4_extent *ex, + ext4_fsblk_t *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t pblk; int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of the ext4_truncate() operation. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here.
+ */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2229,12 +2254,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - ext4_fsblk_t start; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); + pblk = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, pblk); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent, save the partial cluster here, since we + * might need to delete it if we determine that the + * truncate operation has removed all of the blocks in + * the cluster. + */ + if (pblk & (sbi->s_cluster_ratio - 1) && + (ee_len == num)) + *partial_cluster = EXT4_B2C(sbi, pblk); + else + *partial_cluster = 0; } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { /* head removal */ @@ -2269,9 +2306,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start, - ext4_lblk_t end) + struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; @@ -2423,7 +2461,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, a, b); + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); if (err) goto out; @@ -2471,7 +2510,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } + } else + *partial_cluster = 0; ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); @@ -2483,6 +2523,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); + /* + * If there is still an entry in the leaf node, check to see if + * it references the partial cluster. This is the only place + * where it could; if it doesn't, we can free the cluster.
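+ * For example (assuming s_cluster_ratio = 4 purely for illustration): + * if *partial_cluster == N and the last extent in this leaf still maps + * a block into cluster N, the cluster is still partly in use and must + * not be freed here.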
+ */ + if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != + *partial_cluster)) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) @@ -2518,6 +2577,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); struct ext4_ext_path *path; + ext4_fsblk_t partial_cluster = 0; handle_t *handle; int i, err; @@ -2553,7 +2613,8 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, EXT_MAX_BLOCKS - 1); + &partial_cluster, start, + EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3495,6 +3556,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { + ext4_fsblk_t partial_cluster = 0; + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); @@ -3578,7 +3641,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_invalidate_cache(inode); err = ext4_ext_rm_leaf(handle, inode, path, - map->m_lblk, map->m_lblk + punched_out); + &partial_cluster, map->m_lblk, + map->m_lblk + punched_out); if (!err && path->p_hdr->eh_entries == 0) { /* -- cgit v1.2.3-58-ga151 From 5704265188ffe4290ed73b3cb685206c3ed8209d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:56:51 -0400 Subject: ext4: convert s_{dirty,free}blocks_counter to s_{dirty,free}clusters_counter Convert the percpu counters s_dirtyblocks_counter and s_freeblocks_counter in struct ext4_sb_info to be s_dirtyclusters_counter and s_freeclusters_counter.
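To make the block/cluster unit conversion these counters now depend on concrete, here is a rough standalone sketch; the B2C()/C2B() macros below are illustrative stand-ins for the kernel's EXT4_B2C()/EXT4_C2B(), not the actual header definitions:

#include <stdio.h>

/* Illustrative stand-ins for EXT4_B2C()/EXT4_C2B(); cluster_bits is
 * log2(blocks per cluster), i.e. the kernel's sbi->s_cluster_bits. */
#define B2C(cluster_bits, blk) ((blk) >> (cluster_bits))
#define C2B(cluster_bits, clu) ((clu) << (cluster_bits))

int main(void)
{
	int cluster_bits = 4;	/* 16 blocks per cluster, e.g. 64k clusters on 4k blocks */
	unsigned long long free_clusters = 1000;

	/* A counter kept in clusters must be scaled back to blocks
	 * before being reported in block units. */
	printf("%llu free clusters = %llu free blocks\n",
	       free_clusters, C2B(cluster_bits, free_clusters));
	printf("block 100 lives in cluster %llu\n", B2C(cluster_bits, 100ULL));
	return 0;
}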
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 10 +++++----- fs/ext4/ext4.h | 5 +++-- fs/ext4/ialloc.c | 3 ++- fs/ext4/inode.c | 18 ++++++++++-------- fs/ext4/mballoc.c | 12 +++++++----- fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 29 +++++++++++++++-------------- 7 files changed, 44 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 89abf1f7b253..9080a857cda9 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -414,16 +414,16 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks, unsigned int flags) { s64 free_blocks, dirty_blocks, root_blocks; - struct percpu_counter *fbc = &sbi->s_freeblocks_counter; - struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dbc = &sbi->s_dirtyclusters_counter; - free_blocks = percpu_counter_read_positive(fbc); + free_blocks = percpu_counter_read_positive(fcc); dirty_blocks = percpu_counter_read_positive(dbc); root_blocks = ext4_r_blocks_count(sbi->s_es); if (free_blocks - (nblocks + root_blocks + dirty_blocks) < EXT4_FREEBLOCKS_WATERMARK) { - free_blocks = percpu_counter_sum_positive(fbc); + free_blocks = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); dirty_blocks = percpu_counter_sum_positive(dbc); } /* Check whether we have space after @@ -449,7 +449,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks, unsigned int flags) { if (ext4_has_free_blocks(sbi, nblocks, flags)) { - percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + percpu_counter_add(&sbi->s_dirtyclusters_counter, nblocks); return 0; } else return -ENOSPC; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 030bfc1cb59d..c7588366471c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -855,6 +855,7 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; @@ -1144,10 +1145,10 @@ struct ext4_sb_info { u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; + struct percpu_counter s_dirtyclusters_counter; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b7a8130d0af4..58115bad163f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -490,7 +490,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); avefreeb = freeb; do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 904a9a623dab..40f51aae42fe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -281,7 +281,7 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= 
ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used + ei->i_allocated_meta_blocks); ei->i_allocated_meta_blocks = 0; @@ -291,7 +291,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * only when we have written all of the delayed * allocation blocks. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; @@ -1119,14 +1119,14 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * only when we have written all of the delayed * allocation blocks. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; } /* update fs dirty data blocks counter */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -1349,9 +1349,10 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_count_free_blocks(inode->i_sb)); printk(KERN_CRIT "Free/Dirty block details\n"); printk(KERN_CRIT "free_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); printk(KERN_CRIT "Block reservation details\n"); printk(KERN_CRIT "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); @@ -2226,8 +2227,9 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + free_blocks = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 57ce6960e940..4a38b65bd564 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2834,13 +2834,14 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. 
Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_clstrs); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -4384,7 +4385,7 @@ out: if (!ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } @@ -4691,7 +4692,7 @@ do_more: ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); @@ -4833,7 +4834,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 707d3f16f7ce..a324a537f2dc 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) input->reserved_blocks); /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, input->free_blocks_count)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 25a4bfe3f39f..f81e7e791655 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -837,10 +837,10 @@ static void ext4_put_super(struct super_block *sb) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -2473,7 +2473,7 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (s64) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, @@ -3575,7 +3575,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; - err = percpu_counter_init(&sbi->s_freeblocks_counter, + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_blocks(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, @@ -3586,7 +3586,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 
ext4_count_dirs(sb)); } if (!err) { - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); @@ -3701,13 +3701,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * The journal may have updated the bg summary counts, so we * need to update the global counters. */ - percpu_counter_set(&sbi->s_freeblocks_counter, + percpu_counter_set(&sbi->s_freeclusters_counter, ext4_count_free_blocks(sb)); percpu_counter_set(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); percpu_counter_set(&sbi->s_dirs_counter, ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); + percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: /* @@ -3847,10 +3847,10 @@ failed_mount3: del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4173,8 +4173,9 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); @@ -4629,10 +4630,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ - buf->f_bfree = max_t(s64, bfree, 0); + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; -- cgit v1.2.3-58-ga151 From 24aaa8ef4e2b5764ada1fc69787e2fbd4f6276e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 18:58:51 -0400 Subject: ext4: convert the free_blocks field in s_flex_groups to be free_clusters Convert the free_blocks to be free_clusters to make the final revised bigalloc changes easier to read/understand. 
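As a rough sketch of the accounting consequence (illustrative numbers only; c2b() below stands in for EXT4_C2B()), the per-flex-group counter now holds clusters and has to be scaled to blocks before a free-space percentage can be taken, as find_group_flex() does in this patch:

#include <stdio.h>

/* Stand-in for EXT4_C2B(): clusters -> blocks. */
static unsigned int c2b(unsigned int cluster_bits, unsigned int clusters)
{
	return clusters << cluster_bits;
}

int main(void)
{
	unsigned int cluster_bits = 4;			/* 16 blocks per cluster */
	unsigned int blocks_per_flex = 32768 * 16;	/* blocks per group * flex size */
	unsigned int flexbg_free_clusters = 20000;

	/* The counter holds clusters, so scale to blocks first. */
	unsigned int ratio = c2b(cluster_bits, flexbg_free_clusters) * 100
			     / blocks_per_flex;
	printf("flex group free-block ratio: %u%%\n", ratio);
	return 0;
}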
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 40 +++++++++++++++++++++------------------- fs/ext4/mballoc.c | 9 +++++---- fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 2 +- 5 files changed, 30 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c7588366471c..d2584224c89a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -300,7 +300,7 @@ struct ext4_group_desc struct flex_groups { atomic_t free_inodes; - atomic_t free_blocks; + atomic_t free_clusters; atomic_t used_dirs; }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 58115bad163f..0be5862313f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -346,7 +346,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_blocks; + int flexbg_free_clusters; int flex_freeb_ratio; ext4_group_t n_fbg_groups; ext4_group_t i; @@ -355,8 +355,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + flexbg_free_clusters = atomic_read(&flex_group[best_flex].free_clusters); + flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / + blocks_per_flex; if (atomic_read(&flex_group[best_flex].free_inodes) && flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -370,8 +371,9 @@ find_close_to_parent: if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + flexbg_free_clusters = atomic_read(&flex_group[i].free_clusters); + flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / + blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && (atomic_read(&flex_group[i].free_inodes))) { @@ -380,14 +382,14 @@ find_close_to_parent: } if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && + ((atomic_read(&flex_group[i].free_clusters) > + atomic_read(&flex_group[best_flex].free_clusters)) && atomic_read(&flex_group[i].free_inodes))) best_flex = i; } if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) + !atomic_read(&flex_group[best_flex].free_clusters)) return -1; found_flexbg: @@ -407,7 +409,7 @@ out: struct orlov_stats { __u32 free_inodes; - __u32 free_blocks; + __u32 free_clusters; __u32 used_dirs; }; @@ -424,7 +426,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->free_clusters = atomic_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -432,11 +434,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_blks_count(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; - stats->free_blocks = 0; + 
stats->free_clusters = 0; stats->used_dirs = 0; } } @@ -471,10 +473,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; - ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; - ext4_grpblk_t min_blocks; + ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; @@ -492,8 +494,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, avefreei = freei / ngroups; freeb = EXT4_C2B(sbi, percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - avefreeb = freeb; - do_div(avefreeb, ngroups); + avefreec = freeb; + do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && @@ -519,7 +521,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; @@ -557,7 +559,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an @@ -576,7 +578,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4a38b65bd564..f8e37cf2c2dd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2847,7 +2847,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4696,7 +4696,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4839,8 +4840,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a324a537f2dc..95a09ddca3b9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - atomic_add(input->free_blocks_count, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, input->free_blocks_count), + &sbi->s_flex_groups[flex_group].free_clusters); 
atomic_add(EXT4_INODES_PER_GROUP(sb), &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f81e7e791655..d7e0e045b11b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2035,7 +2035,7 @@ static int ext4_fill_flex_info(struct super_block *sb) atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); atomic_add(ext4_free_blks_count(sb, gdp), - &sbi->s_flex_groups[flex_group].free_blocks); + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } -- cgit v1.2.3-58-ga151 From f975d6bcc7a698a10cc755115e27d3612dcfe322 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:00:51 -0400 Subject: ext4: teach ext4_statfs() to deal with clusters if bigalloc is enabled Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d7e0e045b11b..6810957e0ac7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4580,16 +4580,34 @@ restore_opts: return err; } +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block groups can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the overhead for + * older file systems --- and if we come across a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct, especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. + */ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_group_desc *gdp; u64 fsid; s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; + } else if (es->s_overhead_clusters) { + sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; @@ -4604,24 +4622,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * All of the blocks before first_data_block are * overhead */ - overhead = le32_to_cpu(es->s_first_data_block); + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. + * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { - overhead += ext4_bg_has_super(sb, i) + - ext4_bg_num_gdb(sb, i); + gdp = ext4_get_group_desc(sb, i, NULL); + overhead += ext4_num_overhead_clusters(sb, i, gdp); cond_resched(); } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table.
- */ - overhead += ngroups * (2 + sbi->s_itb_per_group); sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = ext4_blocks_count(es); } buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; + buf->f_blocks = (ext4_blocks_count(es) - + EXT4_C2B(sbi, sbi->s_overhead_last)); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ -- cgit v1.2.3-58-ga151 From 27baebb849d46d901e756e6502b0a65a62e43771 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:02:51 -0400 Subject: ext4: tune mballoc's default group prealloc size for bigalloc file systems The default group preallocation size had been previously set to 512 blocks/clusters, regardless of the block/cluster size. This is probably too big for large cluster sizes. So adjust the default so that it is 2 megabytes or 32 clusters, whichever is larger. Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f8e37cf2c2dd..63dd56703342 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -126,7 +126,8 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O @@ -2473,7 +2474,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e., if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group prealloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default.
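+ * Worked through with 4k blocks (illustrative): 512 >> 4 = 32 clusters + * at a 64k cluster size (still 2 megs), while 512 >> 6 = 8 and + * 512 >> 8 = 2 are both raised to the 32-cluster floor for 256k and + * 1 meg clusters respectively.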
+ */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than -- cgit v1.2.3-58-ga151 From 7b415bf60f6afb0499fd3dc0ee33444f54e28567 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 9 Sep 2011 19:04:51 -0400 Subject: ext4: Fix bigalloc quota accounting and i_blocks value With bigalloc changes, the i_blocks value was not correctly set (it was still set to number of blocks being used, but in case of bigalloc, we want i_blocks to represent the number of clusters being used). Since the quota subsystem sets the i_blocks value, this patch fixes the quota accounting and makes sure that the i_blocks value is set correctly. Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 5 +- fs/ext4/ext4.h | 16 ++- fs/ext4/ext4_extents.h | 2 + fs/ext4/extents.c | 306 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/inode.c | 54 ++++++--- fs/ext4/mballoc.c | 5 +- fs/ext4/super.c | 3 +- 7 files changed, 366 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9080a857cda9..bf42b3219e3c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -485,7 +485,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: pointer to total number of blocks needed + * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account @@ -517,7 +517,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_alloc_block_nofail(inode, ar.len); + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d2584224c89a..a6307f7c9807 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -144,9 +144,17 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) #define EXT4_MAP_UNINIT (1 << BH_Uninit) +/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of + * ext4_map_blocks wants to know whether or not the underlying cluster has + * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that + * the requested mapping was from previously mapped (or delayed allocated) + * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster + * should never appear on buffer_head's state flags. 
+ */ +#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT) + EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -1884,6 +1892,7 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2284,6 +2293,11 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ = BH_JBDPrivateStart, + BH_AllocFromCluster, /* allocated blocks were part of already + * allocated cluster. Note that this flag will + * never, ever appear in a buffer_head's state + * flag. See EXT4_MAP_FROM_CLUSTER to see where + * this is used. */ }; BUFFER_FNS(Uninit, uninit) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 095c36f3b612..a52db3a69a30 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cd4479c08031..c4e005864534 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2686,6 +2686,21 @@ again: } } + /* If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. */ + if (partial_cluster && path->p_hdr->eh_entries == 0) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(EXT4_SB(sb), partial_cluster), + EXT4_SB(sb)->s_cluster_ratio, flags); + partial_cluster = 0; + } + /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* @@ -3233,6 +3248,195 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, return ext4_mark_inode_dirty(handle, inode); } +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. 
+ */ +static int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end, + int search_hint_reverse) +{ + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; + + /* reverse search won't work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page || !PageDirty(page)) + goto nextpage; + + if (PageWriteback(page)) { + /* + * This might be a race with allocation and writeout. In + * this case we just assume that the rest of the range + * will eventually be written and there won't be any + * delalloc blocks left. + * TODO: the above assumption is troublesome, but might + * work better in practice. Another option could be to + * note somewhere that the cluster is getting written out + * and detect that here. + */ + page_cache_release(page); + return 0; + } + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + if (buffer_delay(bh)) { + page_cache_release(page); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. + */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and for which quota was reserved. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicates delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster.
+ * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete cluster in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * ext4_da_update_reserve_space() will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + /* Check towards left side */ + c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + if (c_offset) { + lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + /* Now check towards right. */ + c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + return allocated_clusters; +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3338,8 +3542,15 @@ out: * But fallocate would have already updated quota and block * count for this offset.
So cancel these reservations */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 0); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } map_out: map->m_flags |= EXT4_MAP_MAPPED; @@ -3484,6 +3695,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0, reserved_clusters = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; @@ -3499,6 +3711,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3509,6 +3725,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* we should allocate requested block */ } else { /* block is already allocated */ + if (sbi->s_cluster_ratio > 1) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; newblock = map->m_lblk - le32_to_cpu(newex.ee_block) + ext4_ext_pblock(&newex); @@ -3665,6 +3883,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -3681,6 +3903,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); @@ -3692,6 +3915,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(sbi, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; goto got_allocated_blocks; } @@ -3712,6 +3936,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(sbi, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; goto got_allocated_blocks; } @@ -3765,6 +3990,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); free_on_err = 1; + allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; if (ar.len > allocated) ar.len = allocated; @@ -3822,8 +4048,80 @@ got_allocated_blocks: * Update reserved blocks/metadata blocks after successful * block allocation which had been deferred till now. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 1); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + /* + * Check how many clusters we had reserved for this allocated range.
+ */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (reserved_clusters) { + /* + * We have clusters reserved for this range. + * But since we are not doing actual allocation + * and are simply using blocks from previously + * allocated cluster, we should release the + * reservation and not claim quota. + */ + ext4_da_update_reserve_space(inode, + reserved_clusters, 0); + } + } else { + BUG_ON(allocated_clusters < reserved_clusters); + /* We will claim quota for all newly allocated blocks.*/ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + if (reserved_clusters < allocated_clusters) { + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed a few clusters outside of + * the range of this allocation. We should give + * them back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks). Thus, the clusters + * are [0-3],[4-7],[8-11]... + * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11], as + * that range has delayed allocated blocks). + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. + * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]). But our reserved cluster count had + * already gone to 0. + * + * Thus, at step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally get written, we + * could claim them. + */ + while (reservation) { + ext4_da_reserve_space(inode, + map->m_lblk); + reservation--; + } + } + } + } /* * Cache the extent and update transaction to commit on fdatasync only diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 40f51aae42fe..d1c17e47c1c6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -300,14 +300,14 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem for data blocks */ if (quota_claim) - dquot_claim_block(inode, used); + dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks.
*/ - dquot_release_reservation_block(inode, used); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* @@ -1037,14 +1037,14 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single block located at lblock + * Reserve a single cluster located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long md_needed; + unsigned int md_needed; int ret; /* @@ -1054,7 +1054,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) */ repeat: spin_lock(&ei->i_block_reservation_lock); - md_needed = ext4_calc_metadata_amount(inode, lblock); + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); @@ -1063,7 +1064,7 @@ repeat: * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, 1); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; /* @@ -1071,7 +1072,7 @@ repeat: * we cannot afford to run out of free blocks. */ if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { - dquot_release_reservation_block(inode, 1); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1118,6 +1119,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * We can release all of the reserved metadata blocks * only when we have written all of the delayed * allocation blocks. + * Note that in case of bigalloc, i_reserved_meta_blocks, + * i_reserved_data_blocks, etc. refer to number of clusters. */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); @@ -1130,7 +1133,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } static void ext4_da_page_release_reservation(struct page *page, @@ -1139,6 +1142,9 @@ static void ext4_da_page_release_reservation(struct page *page, int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int num_clusters; head = page_buffers(page); bh = head; @@ -1151,7 +1157,20 @@ static void ext4_da_page_release_reservation(struct page *page, } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, to_release); + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. 
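+ * For example (assuming s_cluster_ratio = 4 for illustration): releasing + * blocks 8-11 drops one cluster's worth of reservation, but only if + * ext4_find_delalloc_cluster() confirms that no other block in that + * cluster is still marked for delayed allocation.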
*/ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + ext4_fsblk_t lblk; + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk, 1)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } } /* @@ -1352,7 +1371,8 @@ static void ext4_print_free_blocks(struct inode *inode) (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); printk(KERN_CRIT "Block reservation details\n"); printk(KERN_CRIT "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); @@ -1626,10 +1646,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * XXX: __block_write_begin() unmaps passed block, is it OK? */ - ret = ext4_da_reserve_space(inode, iblock); - if (ret) - /* not enough space to reserve */ - return ret; + /* If the block was allocated from previously allocated cluster, + * then we dont need to reserve it again. */ + if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) { + ret = ext4_da_reserve_space(inode, iblock); + if (ret) + /* not enough space to reserve */ + return ret; + } map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 63dd56703342..5e1215d38331 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4718,6 +4718,9 @@ do_more: freed += count; + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4736,8 +4739,6 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); return; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6810957e0ac7..66b8cfa15636 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2473,7 +2473,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyclusters_counter)); + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, -- cgit v1.2.3-58-ga151 From 6f16b60690ba04cf476480a6f19b204e4b95b4a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:06:51 -0400 Subject: ext4: enable mounting bigalloc as read/write Now that we have implemented all of the changes needed for bigalloc, we can finally enable it! 
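As a reading aid for the cluster arithmetic used throughout this series, here is a minimal stand-alone sketch of what the EXT4_B2C()/EXT4_C2B()/EXT4_NUM_B2C() conversions compute. The struct and shortened macro names below are simplified assumptions for illustration, not the kernel definitions; the cluster ratio of 4 matches the worked example in the comment earlier in the series.

#include <stdio.h>

/* Simplified stand-in for ext4_sb_info: just the cluster geometry. */
struct sbi {
	unsigned int cluster_bits;	/* log2(blocks per cluster) */
	unsigned int cluster_mask;	/* blocks per cluster - 1 */
};

/* Approximations of the kernel macros (names shortened here): */
#define B2C(s, blk)	((blk) >> (s)->cluster_bits)	/* block number -> cluster number */
#define C2B(s, cl)	((cl) << (s)->cluster_bits)	/* cluster count -> block count */
#define NUM_B2C(s, b)	(((b) + (s)->cluster_mask) >> (s)->cluster_bits) /* blocks -> clusters, rounded up */

int main(void)
{
	struct sbi s = { .cluster_bits = 2, .cluster_mask = 3 };	/* s_cluster_ratio == 4 */

	printf("block 10 lies in cluster %u\n", B2C(&s, 10u));		/* 2, i.e. range [8-11] */
	printf("6 blocks need %u cluster(s)\n", NUM_B2C(&s, 6u));	/* 2 */
	printf("3 clusters cover %u blocks\n", C2B(&s, 3u));		/* 12 */
	return 0;
}

This is why the quota calls in the diffs above wrap their counts in EXT4_C2B(): the dquot layer still accounts in blocks, while the reservation counters now track clusters.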
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a6307f7c9807..a5a7e369f719 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1439,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC) /* * Default values for user and/or group using reserved blocks -- cgit v1.2.3-58-ga151 From 021b65bb1e4e4b625c80bbb82651e5e155721ef3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:08:51 -0400 Subject: ext4: Rename ext4_free_blks_{count,set}() to refer to clusters The field bg_free_blocks_count_{lo,hi} in the block group descriptor has been repurposed to hold the number of free clusters for bigalloc file systems. So rename the functions to make it easier to read and audit the block allocation and block freeing code. Note: at this point in bigalloc development we don't support online resize, so this also makes it really obvious all of the places we need to fix up to add support for online resize. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 8 ++++---- fs/ext4/ext4.h | 9 +++++---- fs/ext4/ialloc.c | 14 +++++++------- fs/ext4/mballoc.c | 22 +++++++++++----------- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 10 +++++----- 6 files changed, 33 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bf42b3219e3c..84dc3d62b40c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -168,7 +168,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag.
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); + ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); @@ -550,7 +550,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) @@ -558,7 +558,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) x = ext4_count_free(bitmap_bh, sb->s_blocksize); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", - i, ext4_free_blks_count(sb, gdp), x); + i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); @@ -572,7 +572,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a5a7e369f719..db279c362017 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1978,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); -extern __u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, @@ -1992,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0be5862313f0..71a9c8f3dece 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, * allocation, essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); + ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); @@ -322,8 +322,8 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, if (ext4_free_inodes_count(sb, desc) < avefreei) continue; if (!best_desc || - (ext4_free_blks_count(sb, desc) > - ext4_free_blks_count(sb, best_desc))) { + (ext4_free_group_clusters(sb, desc) > + ext4_free_group_clusters(sb, best_desc))) { *best_group = group; best_desc = desc; ret = 0; @@ -434,7 +434,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_clusters = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; @@ -662,7 +662,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; /* @@ -686,7 +686,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; } @@ -960,7 +960,7 @@ got: ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, + ext4_free_group_clusters_set(sb, gdp, ext4_free_blocks_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5e1215d38331..76db2f12107f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2256,7 +2256,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, ext4_free_blocks_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - ext4_free_blks_count(sb, desc); + ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); @@ -2799,7 +2799,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - ext4_free_blks_count(sb, gdp)); + ext4_free_group_clusters(sb, gdp)); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) @@ -2839,12 +2839,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, - ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_blks_set(sb, gdp, len); + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); @@ -4702,8 +4702,8 @@ do_more: mb_free_blocks(inode, 
&e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count_clusters; - ext4_free_blks_set(sb, gdp, ret); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); @@ -4846,8 +4846,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 95a09ddca3b9..996780ab4f4e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 66b8cfa15636..30cf0e1bd89c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -163,8 +163,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } -__u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg) +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? @@ -219,8 +219,8 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } -void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) @@ -2034,7 +2034,7 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_blks_count(sb, gdp), + atomic_add(ext4_free_group_clusters(sb, gdp), &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); -- cgit v1.2.3-58-ga151 From 5dee54372c1ea15ab482b959634cda8c01b042bd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:10:51 -0400 Subject: ext4: rename ext4_count_free_blocks() to ext4_count_free_clusters() This function really counts the free clusters reported in the block group descriptors, so rename it to reduce confusion. 
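To make the rename concrete, here is a minimal stand-alone sketch of the counting loop this function performs, with simplified types that are assumptions for illustration; the real code walks ext4_get_group_desc() and reads ext4_free_group_clusters() from each descriptor, as the diff below shows.

#include <stddef.h>

typedef unsigned long long ext4_fsblk_t;	/* stand-in for the kernel typedef */

struct group_desc {
	unsigned int free_clusters;	/* what ext4_free_group_clusters() returns */
};

/* Sum the per-group free-cluster counts. The result is in clusters, not
 * blocks, so callers that want blocks must scale it with EXT4_C2B(). */
static ext4_fsblk_t count_free_clusters(const struct group_desc *gdp, size_t ngroups)
{
	ext4_fsblk_t desc_count = 0;
	size_t i;

	for (i = 0; i < ngroups; i++)
		desc_count += gdp[i].free_clusters;
	return desc_count;
}

The EXT4FS_DEBUG branch in the diff below additionally cross-checks this descriptor sum against a count taken directly from the on-disk block bitmaps.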
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 11 ++++++----- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 6 +++--- fs/ext4/super.c | 7 ++++--- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 84dc3d62b40c..4f303809e076 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -524,12 +524,12 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, } /** - * ext4_count_free_blocks() -- count filesystem free blocks + * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * - * Adds up the number of free blocks from each block group. + * Adds up the number of free clusters from each block group. */ -ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; @@ -562,8 +562,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count += x; } brelse(bitmap_bh); - printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_B2C(sbi, ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index db279c362017..f6963cd7f709 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1775,7 +1775,7 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks, unsigned int flags); -extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1c17e47c1c6..f9198d7875cc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1365,7 +1365,8 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); printk(KERN_CRIT "Total free blocks count %lld\n", - ext4_count_free_blocks(inode->i_sb)); + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(inode->i_sb))); printk(KERN_CRIT "Free/Dirty block details\n"); printk(KERN_CRIT "free_blocks=%lld\n", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), @@ -1451,8 +1452,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) if (err == -EAGAIN) goto submit_io; - if (err == -ENOSPC && - ext4_count_free_blocks(sb)) { + if (err == -ENOSPC && ext4_count_free_clusters(sb)) { mpd->retval = err; goto submit_io; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 30cf0e1bd89c..07f3de341452 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2153,7 +2153,8 @@ static int ext4_check_descriptors(struct super_block *sb, if (NULL != first_not_zeroed) *first_not_zeroed = grp; - ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, ext4_count_free_clusters(sb))); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -3577,7 +3578,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; err = percpu_counter_init(&sbi->s_freeclusters_counter, - ext4_count_free_blocks(sb)); + 
ext4_count_free_clusters(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); @@ -3703,7 +3704,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * need to update the global counters. */ percpu_counter_set(&sbi->s_freeclusters_counter, - ext4_count_free_blocks(sb)); + ext4_count_free_clusters(sb)); percpu_counter_set(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); percpu_counter_set(&sbi->s_dirs_counter, -- cgit v1.2.3-58-ga151 From cff1dfd767d1ee3c773fd8b57fe310957e5f8abb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:12:51 -0400 Subject: ext4: rename ext4_free_blocks_after_init() to ext4_free_clusters_after_init() This function really returns the number of clusters after an uninitialized block bitmap has been initialized. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 6 +++--- fs/ext4/ext4.h | 6 +++--- fs/ext4/ialloc.c | 2 +- fs/ext4/mballoc.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4f303809e076..3d07c35c7089 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -213,9 +213,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ -unsigned ext4_free_blocks_after_init(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f6963cd7f709..1473c06e4c94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1787,9 +1787,9 @@ extern void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t group, struct ext4_group_desc *desc); -extern unsigned ext4_free_blocks_after_init(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 71a9c8f3dece..d50a7d5e4726 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -961,7 +961,7 @@ got: if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, - ext4_free_blocks_after_init(sb, group, gdp)); + ext4_free_clusters_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 76db2f12107f..90a3ed7c565c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2253,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, */ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { meta_group_info[i]->bb_free = - ext4_free_blocks_after_init(sb, group, desc); + ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); @@ -2840,7 +2840,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, - ext4_free_blocks_after_init(sb, + ext4_free_clusters_after_init(sb, ac->ac_b_ex.fe_group, gdp)); } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; -- cgit v1.2.3-58-ga151 From e7d5f3156e6827970f75ab27ad7eb0155826eb0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:14:51 -0400 Subject: ext4: rename ext4_claim_free_blocks() to ext4_claim_free_clusters() This function really claims a number of free clusters, not blocks, so rename it so it's clearer what's going on. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 8 ++++---- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 3d07c35c7089..af0c5357e35a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -445,11 +445,11 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, return 0; } -int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nblocks, flags)) { - percpu_counter_add(&sbi->s_dirtyclusters_counter, nblocks); + if (ext4_has_free_blocks(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1473c06e4c94..f97638634e34 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1773,8 +1773,8 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, unsigned int flags, unsigned long *count, int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9198d7875cc..557c34ffa05b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1071,7 +1071,7 @@ repeat: * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 90a3ed7c565c..1c83161090f8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4297,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * and verify allocation doesn't exceed the quota limits. */ while (ar->len && - ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ yield(); -- cgit v1.2.3-58-ga151 From df55c99dc8ee4c3c886a5edc8a4aa6b131c95afc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 9 Sep 2011 19:16:51 -0400 Subject: ext4: rename ext4_has_free_blocks() to ext4_has_free_clusters() Rename the function so it is more clear what is going on. Also rename the various variables so it's clearer what's happening. 
Also fix a missing blocks to cluster conversion when reading the number of reserved blocks for root. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 43 ++++++++++++++++++++++--------------------- fs/ext4/ext4.h | 8 ++++---- fs/ext4/inode.c | 2 +- 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index af0c5357e35a..f6dba4505f1c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -403,42 +403,43 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_has_free_blocks() + * ext4_has_free_clusters() * @sbi: in-core super block structure. - * @nblocks: number of needed blocks + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() * - * Check if filesystem has nblocks free & available for allocation. + * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - s64 free_blocks, dirty_blocks, root_blocks; + s64 free_clusters, dirty_clusters, root_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; - struct percpu_counter *dbc = &sbi->s_dirtyclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; - free_blocks = percpu_counter_read_positive(fcc); - dirty_blocks = percpu_counter_read_positive(dbc); - root_blocks = ext4_r_blocks_count(sbi->s_es); + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); - if (free_blocks - (nblocks + root_blocks + dirty_blocks) < - EXT4_FREEBLOCKS_WATERMARK) { - free_blocks = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); - dirty_blocks = percpu_counter_sum_positive(dbc); + if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + dirty_clusters = percpu_counter_sum_positive(dcc); } - /* Check whether we have space after - * accounting for current dirty blocks & root reserved blocks. + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. */ - if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) return 1; - /* Hm, nope. Are (enough) root reserved blocks available? */ + /* Hm, nope. Are (enough) root reserved clusters available? 
*/ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - if (free_blocks >= (nblocks + dirty_blocks)) + if (free_clusters >= (nclusters + dirty_clusters)) return 1; } @@ -448,7 +449,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nclusters, flags)) { + if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else @@ -469,7 +470,7 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f97638634e34..21ea65d8bd46 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2103,13 +2103,13 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate percpu_counter_batch blocks in their local - * counters. So we need to make sure we have free blocks more +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else -#define EXT4_FREEBLOCKS_WATERMARK 0 +#define EXT4_FREECLUSTERS_WATERMARK 0 #endif static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 557c34ffa05b..88dc63a01756 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2255,7 +2255,7 @@ static int ext4_nonda_switch(struct super_block *sb) percpu_counter_read_positive(&sbi->s_freeclusters_counter)); dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark -- cgit v1.2.3-58-ga151 From d8990240d8c911064447f8aa5a440f9345a6d692 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 9 Sep 2011 19:18:51 -0400 Subject: ext4: add some tracepoints in ext4/extents.c This patch adds some tracepoints in ext4/extents.c and updates a tracepoint in ext4/inode.c. Tested: Built and ran the kernel and verified that these tracepoints work. Also ran xfstests. 
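All of the events added below follow the stock TRACE_EVENT() pattern. As an orienting skeleton (the event name and fields here are illustrative assumptions, not one of the events in this patch, and the include/trace header boilerplate that must surround such a definition is omitted):

TRACE_EVENT(ext4_example_event,
	/* prototype and arguments of the generated trace_ext4_example_event() */
	TP_PROTO(struct inode *inode, unsigned int len),
	TP_ARGS(inode, len),

	/* layout of the record stored in the trace ring buffer */
	TP_STRUCT__entry(
		__field(	dev_t,		dev	)
		__field(	ino_t,		ino	)
		__field(	unsigned int,	len	)
	),

	/* runs at the call site when the event is enabled */
	TP_fast_assign(
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
		__entry->len = len;
	),

	/* how the record is formatted when read back from tracefs */
	TP_printk("dev %d,%d ino %lu len %u",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long) __entry->ino, __entry->len)
);

A call site then just invokes trace_ext4_example_event(inode, len); while the event is disabled, the call compiles down to essentially a no-op, which is why sprinkling these through the allocator paths is cheap.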
Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 + fs/ext4/extents.c | 44 ++++- fs/ext4/inode.c | 3 +- fs/ext4/migrate.c | 1 - fs/ext4/move_extent.c | 1 - include/trace/events/ext4.h | 398 +++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 433 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 21ea65d8bd46..751277a4890c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2339,4 +2339,6 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#include "ext4_extents.h" + #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c4e005864534..9b119308daea 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -42,7 +42,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" #include @@ -1969,6 +1968,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_ext_put_in_cache(inode, block, len, start); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_block = block; cex->ec_len = len; @@ -2070,6 +2070,7 @@ errout: sbi->extent_cache_misses++; else sbi->extent_cache_hits++; + trace_ext4_ext_in_cache(inode, block, ret); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } @@ -2137,6 +2138,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; @@ -2222,6 +2225,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* * If we have a partial cluster, and it's different from the * cluster of the last block, we need to explicitly free the @@ -2336,6 +2340,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2591,6 +2597,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) again: ext4_ext_invalidate_cache(inode); + trace_ext4_ext_remove_space(inode, start, depth); + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. @@ -2686,6 +2694,9 @@ again: } } + trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, + path->p_hdr->eh_entries); + /* If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial * cluster as well. */ @@ -3300,6 +3311,10 @@ static int ext4_find_delalloc_range(struct inode *inode, * detect that here. 
*/ page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 0, i); return 0; } @@ -3327,6 +3342,10 @@ static int ext4_find_delalloc_range(struct inode *inode, if (buffer_delay(bh)) { page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); return 1; } if (search_hint_reverse) @@ -3349,6 +3368,8 @@ nextpage: i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); } + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); return 0; } @@ -3414,6 +3435,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, /* max possible clusters for this allocation */ allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + /* Check towards left side */ c_offset = lblk_start & (sbi->s_cluster_ratio - 1); if (c_offset) { @@ -3453,6 +3476,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); + /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, @@ -3572,7 +3598,7 @@ out2: * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. - * @sbi The ext4-specific superblock structure + * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation @@ -3609,11 +3635,12 @@ out2: * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). */ -static int get_implied_cluster_alloc(struct ext4_sb_info *sbi, +static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start, rr_cluster_end; @@ -3662,8 +3689,12 @@ static int get_implied_cluster_alloc(struct ext4_sb_info *sbi, ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } @@ -3772,6 +3803,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { ext4_fsblk_t partial_cluster = 0; @@ -3912,7 +3946,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * by ext4_ext_find_extent() implies a cluster we can use. 
*/ if (cluster_offset && ex && - get_implied_cluster_alloc(sbi, map, ex, path)) { + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; map->m_flags |= EXT4_MAP_FROM_CLUSTER; @@ -3933,7 +3967,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && ex2 && - get_implied_cluster_alloc(sbi, map, ex2, path)) { + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; map->m_flags |= EXT4_MAP_FROM_CLUSTER; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 88dc63a01756..2dcd4fed96ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -42,7 +42,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" #include "truncate.h" #include @@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); - trace_ext4_da_update_reserve_space(inode, used); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b57b98fb44d1..6f07a06f2437 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -15,7 +15,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f57455a1b1b2..c5826c623e7a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -17,7 +17,6 @@ #include #include #include "ext4_jbd2.h" -#include "ext4_extents.h" #include "ext4.h" /** diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b50a54736242..c9a341e385a3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -12,6 +12,8 @@ struct ext4_allocation_request; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; +struct ext4_map_blocks; +struct ext4_extent; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -1032,9 +1034,9 @@ TRACE_EVENT(ext4_forget, ); TRACE_EVENT(ext4_da_update_reserve_space, - TP_PROTO(struct inode *inode, int used_blocks), + TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), - TP_ARGS(inode, used_blocks), + TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1045,6 +1047,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, __field( int, reserved_data_blocks ) __field( int, reserved_meta_blocks ) __field( int, allocated_meta_blocks ) + __field( int, quota_claim ) ), TP_fast_assign( @@ -1053,19 +1056,24 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; - __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; - __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; - __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + __entry->reserved_data_blocks = + EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = + EXT4_I(inode)->i_reserved_meta_blocks; + __entry->allocated_meta_blocks = + EXT4_I(inode)->i_allocated_meta_blocks; + __entry->quota_claim = quota_claim; ), TP_printk("dev %d,%d ino %lu mode 
0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d reserved_meta_blocks %d " - "allocated_meta_blocks %d", + "allocated_meta_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, - __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) + __entry->reserved_meta_blocks, __entry->allocated_meta_blocks, + __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, @@ -1589,6 +1597,382 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_ARGS(sb, group, start, len) ); +TRACE_EVENT(ext4_ext_handle_uninitialized_extents, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + unsigned int allocated, ext4_fsblk_t newblock), + + TP_ARGS(inode, map, allocated, newblock), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned int, len ) + __field( int, flags ) + __field( unsigned int, allocated ) + __field( ext4_fsblk_t, newblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = map->m_lblk; + __entry->pblk = map->m_pblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; + __entry->allocated = allocated; + __entry->newblk = newblock; + ), + + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" + "allocated %d newblock %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, + __entry->len, __entry->flags, + (unsigned int) __entry->allocated, + (unsigned long long) __entry->newblk) +); + +TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, + TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), + + TP_ARGS(sb, map, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->lblk = map->m_lblk; + __entry->pblk = map->m_pblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lblk, (unsigned long long) __entry->pblk, + __entry->len, __entry->flags, __entry->ret) +); + +TRACE_EVENT(ext4_ext_put_in_cache, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, + ext4_fsblk_t start), + + TP_ARGS(inode, lblk, len, start), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( unsigned int, len ) + __field( ext4_fsblk_t, start ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + __entry->start = start; + ), + + TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + __entry->len, + (unsigned long long) __entry->start) +); + +TRACE_EVENT(ext4_ext_in_cache, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), + + TP_ARGS(inode, lblk, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = 
inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu lblk %u ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + __entry->ret) + +); + +TRACE_EVENT(ext4_find_delalloc_range, + TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, + int reverse, int found, ext4_lblk_t found_blk), + + TP_ARGS(inode, from, to, reverse, found, found_blk), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, from ) + __field( ext4_lblk_t, to ) + __field( int, reverse ) + __field( int, found ) + __field( ext4_lblk_t, found_blk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->from = from; + __entry->to = to; + __entry->reverse = reverse; + __entry->found = found; + __entry->found_blk = found_blk; + ), + + TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " + "(blk = %u)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->from, (unsigned) __entry->to, + __entry->reverse, __entry->found, + (unsigned) __entry->found_blk) +); + +TRACE_EVENT(ext4_get_reserved_cluster_alloc, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), + + TP_ARGS(inode, lblk, len), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( unsigned int, len ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu lblk %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + __entry->len) +); + +TRACE_EVENT(ext4_ext_show_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + unsigned short len), + + TP_ARGS(inode, lblk, pblk, len), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned short, len ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, + (unsigned long long) __entry->pblk, + (unsigned short) __entry->len) +); + +TRACE_EVENT(ext4_remove_blocks, + TP_PROTO(struct inode *inode, struct ext4_extent *ex, + ext4_lblk_t from, ext4_fsblk_t to, + ext4_fsblk_t partial_cluster), + + TP_ARGS(inode, ex, from, to, partial_cluster), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, ee_lblk ) + __field( ext4_fsblk_t, ee_pblk ) + __field( unsigned short, ee_len ) + __field( ext4_lblk_t, from ) + __field( ext4_lblk_t, to ) + __field( ext4_fsblk_t, partial ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->ee_lblk = cpu_to_le32(ex->ee_block); + __entry->ee_pblk = ext4_ext_pblock(ex); + __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->from = from; + __entry->to = to; + __entry->partial = partial_cluster; + ), + + TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" + "from %u to %u partial_cluster %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->ee_lblk, + (unsigned long 
long) __entry->ee_pblk, + (unsigned short) __entry->ee_len, + (unsigned) __entry->from, + (unsigned) __entry->to, + (unsigned) __entry->partial) +); + +TRACE_EVENT(ext4_ext_rm_leaf, + TP_PROTO(struct inode *inode, ext4_lblk_t start, + struct ext4_extent *ex, ext4_fsblk_t partial_cluster), + + TP_ARGS(inode, start, ex, partial_cluster), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, ee_lblk ) + __field( ext4_fsblk_t, ee_pblk ) + __field( short, ee_len ) + __field( ext4_fsblk_t, partial ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->start = start; + __entry->ee_lblk = le32_to_cpu(ex->ee_block); + __entry->ee_pblk = ext4_ext_pblock(ex); + __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->partial = partial_cluster; + ), + + TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" + "partial_cluster %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->start, + (unsigned) __entry->ee_lblk, + (unsigned long long) __entry->ee_pblk, + (unsigned short) __entry->ee_len, + (unsigned) __entry->partial) +); + +TRACE_EVENT(ext4_ext_rm_idx, + TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), + + TP_ARGS(inode, pblk), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_fsblk_t, pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pblk = pblk; + ), + + TP_printk("dev %d,%d ino %lu index_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pblk) +); + +TRACE_EVENT(ext4_ext_remove_space, + TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth), + + TP_ARGS(inode, start, depth), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, start ) + __field( int, depth ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->start = start; + __entry->depth = depth; + ), + + TP_printk("dev %d,%d ino %lu since %u depth %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->start, + __entry->depth) +); + +TRACE_EVENT(ext4_ext_remove_space_done, + TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, + ext4_lblk_t partial, unsigned short eh_entries), + + TP_ARGS(inode, start, depth, partial, eh_entries), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, start ) + __field( int, depth ) + __field( ext4_lblk_t, partial ) + __field( unsigned short, eh_entries ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->start = start; + __entry->depth = depth; + __entry->partial = partial; + __entry->eh_entries = eh_entries; + ), + + TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " + "remaining_entries %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->start, + __entry->depth, + (unsigned) __entry->partial, + (unsigned short) __entry->eh_entries) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3-58-ga151 From 5356f2615cd558c57a1f7d7528d1ad4de3640d96 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 9 Sep 2011 19:20:51 -0400 Subject: ext4: attempt to fix race in bigalloc code path Currently, there exists a race between delayed 
allocated writes and the writeback when the bigalloc feature is in use. The race was because we wanted to determine what blocks in a cluster are under delayed allocation and we were using the buffer_delay(bh) check for it. But the writeback codepath clears this bit without any synchronization, which resulted in a race and an ext4 warning similar to: EXT4-fs (ram1): ext4_da_update_reserve_space: ino 13, used 1 with only 0 reserved data blocks The race existed in two places. (1) between ext4_find_delalloc_range() and ext4_map_blocks() when called from the writeback code path. (2) between ext4_find_delalloc_range() and ext4_da_get_block_prep() (where buffer_delay(bh) is set). To fix (1), this patch introduces a new buffer_head state bit - BH_Da_Mapped. This bit is set under the protection of EXT4_I(inode)->i_data_sem when we have actually mapped the delayed allocated blocks during the writeout time. We can now reliably check for this bit inside ext4_find_delalloc_range() to determine whether the reservation for the blocks has already been claimed or not. To fix (2), it was necessary to set buffer_delay(bh) under the protection of i_data_sem. So, I extracted the very beginning of ext4_map_blocks into a new function - ext4_da_map_blocks() - and performed the required setting of the buffer_delay bit and the quota reservation under the protection of i_data_sem. These two fixes make the checking of buffer_delay(bh) and buffer_da_mapped(bh) consistent, thus removing the race. Tested: I was able to reproduce the problem by running 'dd' and 'fsync' in parallel. Also, xfstests sometimes used to reproduce this race. After the fix both my test and xfstests were successful and no race (warning message) was observed. Google-Bug-Id: 4997027 Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 5 +- fs/ext4/extents.c | 38 +++++--------- fs/ext4/inode.c | 146 +++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 134 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 751277a4890c..1bbd2caebe7f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1893,7 +1893,6 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); -extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2300,10 +2299,14 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9b119308daea..ad39627c1fbc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3296,28 +3296,9 @@ static int ext4_find_delalloc_range(struct inode *inode, while ((i >= lblk_start) && (i <= lblk_end)) { page = find_get_page(mapping, index); - if (!page || !PageDirty(page)) + if (!page) goto nextpage; - if (PageWriteback(page)) { - /* - * This might be a race with allocation and writeout.
In - * this case we just assume that the rest of the range - * will eventually be written and there wont be any - * delalloc blocks left. - * TODO: the above assumption is troublesome, but might - * work better in practice. other option could be note - * somewhere that the cluster is getting written out and - * detect that here. - */ - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 0, i); - return 0; - } - if (!page_has_buffers(page)) goto nextpage; @@ -3340,7 +3321,11 @@ static int ext4_find_delalloc_range(struct inode *inode, continue; } - if (buffer_delay(bh)) { + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { page_cache_release(page); trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, @@ -4106,6 +4091,7 @@ got_allocated_blocks: ext4_da_update_reserve_space(inode, allocated_clusters, 1); if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - reserved_clusters; /* @@ -4148,11 +4134,11 @@ got_allocated_blocks: * remaining blocks finally gets written, we * could claim them. */ - while (reservation) { - ext4_da_reserve_space(inode, - map->m_lblk); - reservation--; - } + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); } } } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2dcd4fed96ec..1380cd29c312 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -397,6 +397,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +/* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. + */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -516,9 +559,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. 
+ */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); + } + up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -1038,7 +1089,7 @@ static int ext4_journalled_write_end(struct file *file, /* * Reserve a single cluster located at lblock */ -int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -1153,6 +1204,7 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); @@ -1271,6 +1323,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1603,6 +1657,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } +/* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. + */ +static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + struct ext4_map_blocks *map, + struct buffer_head *bh) +{ + int retval; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + map->m_flags = 0; + ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," + "logical block %lu\n", inode->i_ino, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); + else + retval = ext4_ind_map_blocks(NULL, inode, map, 0); + + if (retval == 0) { + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + /* If the block was allocated from previously allocated cluster, + * then we dont need to reserve it again. */ + if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + retval = ext4_da_reserve_space(inode, iblock); + if (retval) + /* not enough space to reserve */ + goto out_unlock; + } + + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served + * and it should not appear on the bh->b_state. + */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + } + +out_unlock: + up_read((&EXT4_I(inode)->i_data_sem)); + + return retval; +} + /* * This is a special get_blocks_t callback which is used by * ext4_da_write_begin(). 
It will either return mapped block or @@ -1620,10 +1734,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, { struct ext4_map_blocks map; int ret = 0; - sector_t invalid_block = ~((sector_t) 0xffff); - - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); @@ -1636,29 +1746,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) + ret = ext4_da_map_blocks(inode, iblock, &map, bh); + if (ret <= 0) return ret; - if (ret == 0) { - if (buffer_delay(bh)) - return 0; /* Not sure this could or should happen */ - /* - * XXX: __block_write_begin() unmaps passed block, is it OK? - */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ - if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) { - ret = ext4_da_reserve_space(inode, iblock); - if (ret) - /* not enough space to reserve */ - return ret; - } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - return 0; - } map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; -- cgit v1.2.3-58-ga151 From e8a0e41266e9c207ad8ac158cee9547ef1bc90ac Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 13 Jun 2011 22:16:44 +0200 Subject: jffs2: Avoid unneeded 'if' before kfree kfree() deals gracefully with NULL pointers, so it's pointless to test for one prior to calling it. This removes such a test from jffs2_scan_medium(). Signed-off-by: Jesper Juhl Signed-off-by: Artem Bityutskiy --- fs/jffs2/scan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 8d8cd3419d02..28107ca136e4 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) else c->mtd->unpoint(c->mtd, 0, c->mtd->size); #endif - if (s) - kfree(s); - + kfree(s); return ret; } -- cgit v1.2.3-58-ga151 From 51b11e3630672b7ce8793ecf13e5759656edf38a Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Tue, 28 Jun 2011 00:21:30 +0400 Subject: jffs2: use mutex_is_locked() in __jffs2_flush_wbuf() Use a helper to test if a mutex is held instead of a hack with mutex_trylock(). Signed-off-by: Alexey Khoroshilov Signed-off-by: Artem Bityutskiy --- fs/jffs2/wbuf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4515bea0268f..a10fb24ca6e6 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (!jffs2_is_writebuffered(c)) return 0; - if (mutex_trylock(&c->alloc_sem)) { - mutex_unlock(&c->alloc_sem); + if (!mutex_is_locked(&c->alloc_sem)) { printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } -- cgit v1.2.3-58-ga151 From 0612b9ddc2eeda014dd805c87c752b342d8f80f0 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 30 Aug 2011 18:45:40 -0700 Subject: mtd: rename MTD_OOB_* to MTD_OPS_* These modes are not necessarily for OOB only. Particularly, MTD_OOB_RAW affected operations on in-band page data as well. To clarify these options and to emphasize that their effect is applied per-operation, we change the primary prefix to MTD_OPS_. 
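For illustration, a minimal sketch of an OOB read under the new names (here "offs", "databuf" and "oobbuf" stand in for values the calling driver supplies; "mtd" is the usual struct mtd_info handle):

	struct mtd_oob_ops ops;
	int ret;

	ops.mode    = MTD_OPS_AUTO_OOB;	/* was MTD_OOB_AUTO */
	ops.len     = mtd->writesize;	/* in-band page data to read */
	ops.datbuf  = databuf;
	ops.ooblen  = mtd->ecclayout->oobavail;	/* OOB bytes, placed per ecclayout */
	ops.ooboffs = 0;
	ops.oobbuf  = oobbuf;
	ops.retlen  = ops.oobretlen = 0;
	ret = mtd->read_oob(mtd, offs, &ops);	/* mode applies to data and OOB alike */

Only the identifiers change: the enum values (0, 1, 2) and the per-operation semantics are untouched, so existing users of the ABI behave exactly as before.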
Signed-off-by: Brian Norris Signed-off-by: Artem Bityutskiy --- drivers/mtd/devices/doc2000.c | 4 ++-- drivers/mtd/devices/doc2001.c | 4 ++-- drivers/mtd/devices/doc2001plus.c | 4 ++-- drivers/mtd/inftlcore.c | 6 ++--- drivers/mtd/mtdchar.c | 10 +++++---- drivers/mtd/mtdpart.c | 2 +- drivers/mtd/mtdswap.c | 6 ++--- drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 2 +- drivers/mtd/nand/nand_base.c | 40 +++++++++++++++++----------------- drivers/mtd/nand/nand_bbt.c | 8 +++---- drivers/mtd/nand/sm_common.c | 2 +- drivers/mtd/nftlcore.c | 6 ++--- drivers/mtd/onenand/onenand_base.c | 38 ++++++++++++++++---------------- drivers/mtd/onenand/onenand_bbt.c | 2 +- drivers/mtd/sm_ftl.c | 4 ++-- drivers/mtd/ssfdc.c | 2 +- drivers/mtd/tests/mtd_oobtest.c | 24 ++++++++++---------- drivers/mtd/tests/mtd_readtest.c | 2 +- drivers/staging/spectra/lld_mtd.c | 6 ++--- fs/jffs2/wbuf.c | 6 ++--- include/linux/mtd/mtd.h | 2 +- include/mtd/mtd-abi.h | 16 +++++++------- 22 files changed, 99 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c index 8c9703309496..e9fad9151219 100644 --- a/drivers/mtd/devices/doc2000.c +++ b/drivers/mtd/devices/doc2000.c @@ -927,7 +927,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; @@ -1091,7 +1091,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, struct DiskOnChip *this = mtd->priv; int ret; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); mutex_lock(&this->lock); ret = doc_write_oob_nolock(mtd, ofs + ops->ooboffs, ops->len, diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c index 3d2b459cea92..a3f7a27499be 100644 --- a/drivers/mtd/devices/doc2001.c +++ b/drivers/mtd/devices/doc2001.c @@ -631,7 +631,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; @@ -689,7 +689,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c index d28c9d99979f..99351bc3e0ed 100644 --- a/drivers/mtd/devices/doc2001plus.c +++ b/drivers/mtd/devices/doc2001plus.c @@ -834,7 +834,7 @@ static int doc_read_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; @@ -919,7 +919,7 @@ static int doc_write_oob(struct mtd_info *mtd, loff_t ofs, uint8_t *buf = ops->oobbuf; size_t len = ops->len; - BUG_ON(ops->mode != MTD_OOB_PLACE); + BUG_ON(ops->mode != MTD_OPS_PLACE_OOB); ofs += ops->ooboffs; diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 21aa981e42cd..652065e47a79 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -152,7 +152,7 @@ int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & (mtd->writesize - 1); ops.ooblen = len; ops.oobbuf = buf; @@ -172,7 +172,7 @@ int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len, 
struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & (mtd->writesize - 1); ops.ooblen = len; ops.oobbuf = buf; @@ -192,7 +192,7 @@ static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs; ops.ooblen = mtd->oobsize; ops.oobbuf = oob; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index d0eaef67b9bb..9b76438a3c27 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -221,7 +221,7 @@ static ssize_t mtd_read(struct file *file, char __user *buf, size_t count,loff_t { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.datbuf = kbuf; ops.oobbuf = NULL; ops.len = len; @@ -317,7 +317,7 @@ static ssize_t mtd_write(struct file *file, const char __user *buf, size_t count { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.datbuf = kbuf; ops.oobbuf = NULL; ops.ooboffs = 0; @@ -413,7 +413,8 @@ static int mtd_do_writeoob(struct file *file, struct mtd_info *mtd, ops.ooblen = length; ops.ooboffs = start & (mtd->writesize - 1); ops.datbuf = NULL; - ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OPS_RAW : + MTD_OPS_PLACE_OOB; if (ops.ooboffs && ops.ooblen > (mtd->oobsize - ops.ooboffs)) return -EINVAL; @@ -457,7 +458,8 @@ static int mtd_do_readoob(struct file *file, struct mtd_info *mtd, ops.ooblen = length; ops.ooboffs = start & (mtd->writesize - 1); ops.datbuf = NULL; - ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = (mfi->mode == MTD_MODE_RAW) ? MTD_OPS_RAW : + MTD_OPS_PLACE_OOB; if (ops.ooboffs && ops.ooblen > (mtd->oobsize - ops.ooboffs)) return -EINVAL; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index c90b7ba362d7..cd7785aa1649 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -130,7 +130,7 @@ static int part_read_oob(struct mtd_info *mtd, loff_t from, if (ops->oobbuf) { size_t len, pages; - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) len = mtd->oobavail; else len = mtd->oobsize; diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 9961063b90a2..910309f260f8 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -350,7 +350,7 @@ static int mtdswap_read_markers(struct mtdswap_dev *d, struct swap_eb *eb) ops.oobbuf = d->oob_buf; ops.ooboffs = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ret = mtdswap_read_oob(d, offset, &ops); @@ -389,7 +389,7 @@ static int mtdswap_write_marker(struct mtdswap_dev *d, struct swap_eb *eb, ops.ooboffs = 0; ops.oobbuf = (uint8_t *)&n; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = NULL; if (marker == MTDSWAP_TYPE_CLEAN) { @@ -931,7 +931,7 @@ static unsigned int mtdswap_eblk_passes(struct mtdswap_dev *d, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = mtd->writesize; ops.ooblen = mtd->ecclayout->oobavail; ops.ooboffs = 0; diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c index 5c0fe0dd7057..071b63420f0e 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c @@ -1104,7 +1104,7 @@ gpmi_ecc_write_oob(struct mtd_info *mtd, struct nand_chip *chip, int page) * The BCH will use all the (page + oob). 
* Our gpmi_hw_ecclayout can only prohibit the JFFS2 to write the oob. * But it can not stop some ioctls such MEMWRITEOOB which uses - * MTD_OOB_PLACE. So We have to implement this function to prohibit + * MTD_OPS_PLACE_OOB. So We have to implement this function to prohibit * these ioctls too. */ return -EPERM; diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index ad40607f5f24..686b55770113 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1382,12 +1382,12 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob, { switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_RAW: memcpy(oob, chip->oob_poi + ops->ooboffs, len); return oob + len; - case MTD_OOB_AUTO: { + case MTD_OPS_AUTO_OOB: { struct nand_oobfree *free = chip->ecc.layout->oobfree; uint32_t boffs = 0, roffs = ops->ooboffs; size_t bytes = 0; @@ -1437,7 +1437,7 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, int ret = 0; uint32_t readlen = ops->len; uint32_t oobreadlen = ops->ooblen; - uint32_t max_oobsize = ops->mode == MTD_OOB_AUTO ? + uint32_t max_oobsize = ops->mode == MTD_OPS_AUTO_OOB ? mtd->oobavail : mtd->oobsize; uint8_t *bufpoi, *oob, *buf; @@ -1469,7 +1469,7 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, } /* Now read the page into the buffer */ - if (unlikely(ops->mode == MTD_OOB_RAW)) + if (unlikely(ops->mode == MTD_OPS_RAW)) ret = chip->ecc.read_page_raw(mtd, chip, bufpoi, page); else if (!aligned && NAND_SUBPAGE_READ(chip) && !oob) @@ -1759,7 +1759,7 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from, stats = mtd->ecc_stats; - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) len = chip->ecc.layout->oobavail; else len = mtd->oobsize; @@ -1787,7 +1787,7 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from, page = realpage & chip->pagemask; while (1) { - if (ops->mode == MTD_OOB_RAW) + if (ops->mode == MTD_OPS_RAW) sndcmd = chip->ecc.read_oob_raw(mtd, chip, page, sndcmd); else sndcmd = chip->ecc.read_oob(mtd, chip, page, sndcmd); @@ -1865,9 +1865,9 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, nand_get_device(chip, mtd, FL_READING); switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: + case MTD_OPS_RAW: break; default: @@ -2113,12 +2113,12 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len, switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_RAW: memcpy(chip->oob_poi + ops->ooboffs, oob, len); return oob + len; - case MTD_OOB_AUTO: { + case MTD_OPS_AUTO_OOB: { struct nand_oobfree *free = chip->ecc.layout->oobfree; uint32_t boffs = 0, woffs = ops->ooboffs; size_t bytes = 0; @@ -2167,7 +2167,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, uint32_t writelen = ops->len; uint32_t oobwritelen = ops->ooblen; - uint32_t oobmaxlen = ops->mode == MTD_OOB_AUTO ? + uint32_t oobmaxlen = ops->mode == MTD_OPS_AUTO_OOB ? 
mtd->oobavail : mtd->oobsize; uint8_t *oob = ops->oobbuf; @@ -2236,7 +2236,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, } ret = chip->write_page(mtd, chip, wbuf, page, cached, - (ops->mode == MTD_OOB_RAW)); + (ops->mode == MTD_OPS_RAW)); if (ret) break; @@ -2356,7 +2356,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, pr_debug("%s: to = 0x%08x, len = %i\n", __func__, (unsigned int)to, (int)ops->ooblen); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) len = chip->ecc.layout->oobavail; else len = mtd->oobsize; @@ -2408,7 +2408,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, nand_fill_oob(mtd, ops->oobbuf, ops->ooblen, ops); - if (ops->mode == MTD_OOB_RAW) + if (ops->mode == MTD_OPS_RAW) status = chip->ecc.write_oob_raw(mtd, chip, page & chip->pagemask); else status = chip->ecc.write_oob(mtd, chip, page & chip->pagemask); @@ -2445,9 +2445,9 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, nand_get_device(chip, mtd, FL_WRITING); switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: - case MTD_OOB_RAW: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: + case MTD_OPS_RAW: break; default: diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 6aa8125772b8..c488bcb84c90 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -302,7 +302,7 @@ static int scan_read_raw_oob(struct mtd_info *mtd, uint8_t *buf, loff_t offs, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.ooboffs = 0; ops.ooblen = mtd->oobsize; @@ -350,7 +350,7 @@ static int scan_write_bbt(struct mtd_info *mtd, loff_t offs, size_t len, { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = 0; ops.ooblen = mtd->oobsize; ops.datbuf = buf; @@ -433,7 +433,7 @@ static int scan_block_fast(struct mtd_info *mtd, struct nand_bbt_descr *bd, ops.oobbuf = buf; ops.ooboffs = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; for (j = 0; j < len; j++) { /* @@ -672,7 +672,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, ops.ooblen = mtd->oobsize; ops.ooboffs = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; if (!rcode) rcode = 0xff; diff --git a/drivers/mtd/nand/sm_common.c b/drivers/mtd/nand/sm_common.c index b6332e83b289..2c438ef53cf5 100644 --- a/drivers/mtd/nand/sm_common.c +++ b/drivers/mtd/nand/sm_common.c @@ -47,7 +47,7 @@ static int sm_block_markbad(struct mtd_info *mtd, loff_t ofs) /* As long as this function is called on erase block boundaries it will work correctly for 256 byte nand */ - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = 0; ops.ooblen = mtd->oobsize; ops.oobbuf = (void *)&oob; diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 93d6fc68b892..272e3c03e324 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -147,7 +147,7 @@ int nftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & mask; ops.ooblen = len; ops.oobbuf = buf; @@ -168,7 +168,7 @@ int nftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len, struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & mask; ops.ooblen = len; ops.oobbuf = buf; @@ -191,7 +191,7 @@ static int nftl_write(struct mtd_info *mtd, loff_t offs, size_t len, 
struct mtd_oob_ops ops; int res; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooboffs = offs & mask; ops.ooblen = mtd->oobsize; ops.oobbuf = oob; diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 493901a59e6e..a52aa0f6b0c3 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1125,7 +1125,7 @@ static int onenand_mlc_read_ops_nolock(struct mtd_info *mtd, loff_t from, pr_debug("%s: from = 0x%08x, len = %i\n", __func__, (unsigned int)from, (int)len); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1170,7 +1170,7 @@ static int onenand_mlc_read_ops_nolock(struct mtd_info *mtd, loff_t from, thisooblen = oobsize - oobcolumn; thisooblen = min_t(int, thisooblen, ooblen - oobread); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) onenand_transfer_auto_oob(mtd, oobbuf, oobcolumn, thisooblen); else this->read_bufferram(mtd, ONENAND_SPARERAM, oobbuf, oobcolumn, thisooblen); @@ -1229,7 +1229,7 @@ static int onenand_read_ops_nolock(struct mtd_info *mtd, loff_t from, pr_debug("%s: from = 0x%08x, len = %i\n", __func__, (unsigned int)from, (int)len); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1291,7 +1291,7 @@ static int onenand_read_ops_nolock(struct mtd_info *mtd, loff_t from, thisooblen = oobsize - oobcolumn; thisooblen = min_t(int, thisooblen, ooblen - oobread); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) onenand_transfer_auto_oob(mtd, oobbuf, oobcolumn, thisooblen); else this->read_bufferram(mtd, ONENAND_SPARERAM, oobbuf, oobcolumn, thisooblen); @@ -1363,7 +1363,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, /* Initialize return length value */ ops->oobretlen = 0; - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1409,7 +1409,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, break; } - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) onenand_transfer_auto_oob(mtd, buf, column, thislen); else this->read_bufferram(mtd, ONENAND_SPARERAM, buf, column, thislen); @@ -1487,10 +1487,10 @@ static int onenand_read_oob(struct mtd_info *mtd, loff_t from, int ret; switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: break; - case MTD_OOB_RAW: + case MTD_OPS_RAW: /* Not implemented yet */ default: return -EINVAL; @@ -1908,7 +1908,7 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, if (!len) return 0; - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else oobsize = mtd->oobsize; @@ -1945,7 +1945,7 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, /* We send data to spare ram with oobsize * to prevent byte access */ memset(oobbuf, 0xff, mtd->oobsize); - if (ops->mode == MTD_OOB_AUTO) + if (ops->mode == MTD_OPS_AUTO_OOB) onenand_fill_auto_oob(mtd, oobbuf, oob, oobcolumn, thisooblen); else memcpy(oobbuf + oobcolumn, oob, thisooblen); @@ -2084,7 +2084,7 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, /* Initialize retlen, in case of early exit */ ops->oobretlen = 0; - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) oobsize = this->ecclayout->oobavail; else 
oobsize = mtd->oobsize; @@ -2128,7 +2128,7 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, /* We send data to spare ram with oobsize * to prevent byte access */ memset(oobbuf, 0xff, mtd->oobsize); - if (mode == MTD_OOB_AUTO) + if (mode == MTD_OPS_AUTO_OOB) onenand_fill_auto_oob(mtd, oobbuf, buf, column, thislen); else memcpy(oobbuf + column, buf, thislen); @@ -2217,10 +2217,10 @@ static int onenand_write_oob(struct mtd_info *mtd, loff_t to, int ret; switch (ops->mode) { - case MTD_OOB_PLACE: - case MTD_OOB_AUTO: + case MTD_OPS_PLACE_OOB: + case MTD_OPS_AUTO_OOB: break; - case MTD_OOB_RAW: + case MTD_OPS_RAW: /* Not implemented yet */ default: return -EINVAL; @@ -2603,7 +2603,7 @@ static int onenand_default_block_markbad(struct mtd_info *mtd, loff_t ofs) struct bbm_info *bbm = this->bbm; u_char buf[2] = {0, 0}; struct mtd_oob_ops ops = { - .mode = MTD_OOB_PLACE, + .mode = MTD_OPS_PLACE_OOB, .ooblen = 2, .oobbuf = buf, .ooboffs = 0, @@ -3171,7 +3171,7 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len, this->command(mtd, ONENAND_CMD_RESET, 0, 0); this->wait(mtd, FL_RESETING); } else { - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooblen = len; ops.oobbuf = buf; ops.ooboffs = 0; @@ -3677,7 +3677,7 @@ static int flexonenand_check_blocks_erased(struct mtd_info *mtd, int start, int int i, ret; int block; struct mtd_oob_ops ops = { - .mode = MTD_OOB_PLACE, + .mode = MTD_OPS_PLACE_OOB, .ooboffs = 0, .ooblen = mtd->oobsize, .datbuf = NULL, diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c index 3b9a2a9573c6..09956c4fd850 100644 --- a/drivers/mtd/onenand/onenand_bbt.c +++ b/drivers/mtd/onenand/onenand_bbt.c @@ -80,7 +80,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr startblock = 0; from = 0; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.ooblen = readlen; ops.oobbuf = buf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 311a9e39a956..d927641cb0f5 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -256,7 +256,7 @@ static int sm_read_sector(struct sm_ftl *ftl, if (!oob) oob = &tmp_oob; - ops.mode = ftl->smallpagenand ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = ftl->smallpagenand ? MTD_OPS_RAW : MTD_OPS_PLACE_OOB; ops.ooboffs = 0; ops.ooblen = SM_OOB_SIZE; ops.oobbuf = (void *)oob; @@ -336,7 +336,7 @@ static int sm_write_sector(struct sm_ftl *ftl, if (ftl->unstable) return -EIO; - ops.mode = ftl->smallpagenand ? MTD_OOB_RAW : MTD_OOB_PLACE; + ops.mode = ftl->smallpagenand ? 
MTD_OPS_RAW : MTD_OPS_PLACE_OOB; ops.len = SM_SECTOR_SIZE; ops.datbuf = buffer; ops.ooboffs = 0; diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 5f917f0a9609..976e3d28b962 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -169,7 +169,7 @@ static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf) struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_RAW; + ops.mode = MTD_OPS_RAW; ops.ooboffs = 0; ops.ooblen = OOB_SIZE; ops.oobbuf = buf; diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c index dec92ae6111a..6bcff42296a9 100644 --- a/drivers/mtd/tests/mtd_oobtest.c +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -131,7 +131,7 @@ static int write_eraseblock(int ebnum) for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { set_random_data(writebuf, use_len); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = use_len; @@ -184,7 +184,7 @@ static int verify_eraseblock(int ebnum) for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { set_random_data(writebuf, use_len); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = use_len; @@ -211,7 +211,7 @@ static int verify_eraseblock(int ebnum) if (use_offset != 0 || use_len < mtd->ecclayout->oobavail) { int k; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail; @@ -276,7 +276,7 @@ static int verify_eraseblock_in_one_go(int ebnum) size_t len = mtd->ecclayout->oobavail * pgcnt; set_random_data(writebuf, len); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = len; @@ -507,7 +507,7 @@ static int __init mtd_oobtest_init(void) addr0 += mtd->erasesize; /* Attempt to write off end of OOB */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = 1; @@ -527,7 +527,7 @@ static int __init mtd_oobtest_init(void) } /* Attempt to read off end of OOB */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = 1; @@ -551,7 +551,7 @@ static int __init mtd_oobtest_init(void) "block is bad\n"); else { /* Attempt to write off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail + 1; @@ -571,7 +571,7 @@ static int __init mtd_oobtest_init(void) } /* Attempt to read off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail + 1; @@ -595,7 +595,7 @@ static int __init mtd_oobtest_init(void) goto out; /* Attempt to write off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail; @@ -615,7 +615,7 @@ static int __init mtd_oobtest_init(void) } /* Attempt to read off end of device */ - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail; @@ -655,7 +655,7 @@ static int __init mtd_oobtest_init(void) addr = (i + 1) * mtd->erasesize - mtd->writesize; for (pg = 0; pg < cnt; ++pg) { set_random_data(writebuf, sz); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = sz; @@ -683,7 +683,7 @@ static int __init mtd_oobtest_init(void) continue; set_random_data(writebuf, mtd->ecclayout->oobavail * 2); addr = (i + 1) * mtd->erasesize - mtd->writesize; - ops.mode 
= MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->ecclayout->oobavail * 2; diff --git a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c index 836792d1d60e..587e1e371c6c 100644 --- a/drivers/mtd/tests/mtd_readtest.c +++ b/drivers/mtd/tests/mtd_readtest.c @@ -66,7 +66,7 @@ static int read_eraseblock_by_page(int ebnum) if (mtd->oobsize) { struct mtd_oob_ops ops; - ops.mode = MTD_OOB_PLACE; + ops.mode = MTD_OPS_PLACE_OOB; ops.len = 0; ops.retlen = 0; ops.ooblen = mtd->oobsize; diff --git a/drivers/staging/spectra/lld_mtd.c b/drivers/staging/spectra/lld_mtd.c index 2bd34662beb5..a9c309a167c2 100644 --- a/drivers/staging/spectra/lld_mtd.c +++ b/drivers/staging/spectra/lld_mtd.c @@ -340,7 +340,7 @@ u16 mtd_Read_Page_Main_Spare(u8 *read_data, u32 Block, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = read_data; ops.len = DeviceInfo.wPageDataSize; ops.oobbuf = read_data + DeviceInfo.wPageDataSize + BTSIG_OFFSET; @@ -400,7 +400,7 @@ u16 mtd_Write_Page_Main_Spare(u8 *write_data, u32 Block, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = write_data; ops.len = DeviceInfo.wPageDataSize; ops.oobbuf = write_data + DeviceInfo.wPageDataSize + BTSIG_OFFSET; @@ -473,7 +473,7 @@ u16 mtd_Read_Page_Spare(u8 *read_data, u32 Block, struct mtd_oob_ops ops; int ret; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.datbuf = NULL; ops.len = 0; ops.oobbuf = read_data; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a10fb24ca6e6..b09e51d2f81f 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1025,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); struct mtd_oob_ops ops; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1068,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1094,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = (uint8_t *)&oob_cleanmarker; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 6882cd968a3e..c2047b85691d 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -79,7 +79,7 @@ struct mtd_erase_region_info { * @ooblen: number of oob bytes to write/read * @oobretlen: number of oob bytes written/read * @ooboffs: offset of oob data in the oob area (only relevant when - * mode = MTD_OOB_PLACE) + * mode = MTD_OPS_PLACE_OOB) * @datbuf: data buffer - if NULL only oob data are read/written * @oobbuf: oob data buffer * diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index af42c7a34805..d30990e1858b 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -46,18 +46,18 @@ struct mtd_oob_buf64 { }; /* - * oob operation modes + * MTD operation modes * - * MTD_OOB_PLACE: oob data are placed at the given offset (default) - * MTD_OOB_AUTO: oob data are automatically placed 
at the free areas - * which are defined by the internal ecclayout - * MTD_OOB_RAW: mode to read or write oob and data without doing ECC + * MTD_OPS_PLACE_OOB: oob data are placed at the given offset (default) + * MTD_OPS_AUTO_OOB: oob data are automatically placed at the free areas + * which are defined by the internal ecclayout + * MTD_OPS_RAW: mode to read or write oob and data without doing ECC * checking */ enum { - MTD_OOB_PLACE = 0, - MTD_OOB_AUTO = 1, - MTD_OOB_RAW = 2, + MTD_OPS_PLACE_OOB = 0, + MTD_OPS_AUTO_OOB = 1, + MTD_OPS_RAW = 2, }; #define MTD_ABSENT 0 -- cgit v1.2.3-58-ga151 From 4581d1409977c5fe686a4ed487cdce2e50031826 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 14:56:09 -0400 Subject: nfsd4: rearrange to avoid a forward reference Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c28432a80210..f0eccc236a0d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -60,7 +60,6 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); @@ -1061,6 +1060,32 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static int +same_stateid(stateid_t *id_one, stateid_t *id_two) +{ + if (id_one->si_stateownerid != id_two->si_stateownerid) + return 0; + return id_one->si_fileid == id_two->si_fileid; +} + +static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) +{ + struct nfs4_stateid *s; + unsigned int hashval; + + hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); + list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { + if (!same_stateid(&s->st_stateid, t)) + continue; + if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) + return NULL; + if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) + return NULL; + return s; + } + return NULL; +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -3694,32 +3719,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static int -same_stateid(stateid_t *id_one, stateid_t *id_two) -{ - if (id_one->si_stateownerid != id_two->si_stateownerid) - return 0; - return id_one->si_fileid == id_two->si_fileid; -} - -static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) -{ - struct nfs4_stateid *s; - unsigned int hashval; - - hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); - list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { - if (!same_stateid(&s->st_stateid, t)) - continue; - if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) - return NULL; - if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) - return NULL; - return s; - } - return NULL; -} - static struct nfs4_delegation * search_for_delegation(stateid_t *stid) { -- cgit v1.2.3-58-ga151 From 4d71ab8751c1d749f9fdf84ec094989adf579493 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Tue, 6 Sep 2011 16:48:57 -0400 Subject: nfsd4: split up find_stateid Minor cleanup. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f0eccc236a0d..aa088bc3b169 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1068,21 +1068,29 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_stateid *find_stateid(stateid_t *t, int flags) +static struct nfs4_stateid *find_stateid(stateid_t *t) { struct nfs4_stateid *s; unsigned int hashval; hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); - list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) { - if (!same_stateid(&s->st_stateid, t)) - continue; - if (flags & LOCK_STATE && s->st_type != NFS4_LOCK_STID) - return NULL; - if (flags & OPEN_STATE && s->st_type != NFS4_OPEN_STID) - return NULL; + list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) + if (same_stateid(&s->st_stateid, t)) + return s; + return NULL; +} + +static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, int flags) +{ + struct nfs4_stateid *s; + + s = find_stateid(t); + if (!s) + return NULL; + if (flags & LOCK_STATE && s->st_type == NFS4_LOCK_STID) + return s; + if (flags & OPEN_STATE && s->st_type == NFS4_OPEN_STID) return s; - } return NULL; } @@ -3241,7 +3249,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_expired; - stp = find_stateid(stateid, 0); + stp = find_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3306,7 +3314,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, BUG_ON(!*filpp); } } else { /* open or lock stateid */ - stp = find_stateid(stateid, flags); + stp = find_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3381,7 +3389,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - stp = find_stateid(stateid, 0); + stp = find_stateid(stateid); if (!stp) { ret = nfserr_bad_stateid; goto out; @@ -3440,7 +3448,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, * the confirmed flag is incorrecly set, or the generation * number is incorrect. */ - *stpp = find_stateid(stateid, flags); + *stpp = find_stateid_by_type(stateid, flags); if (*stpp == NULL) return nfserr_expired; -- cgit v1.2.3-58-ga151 From c0a5d93efbbb79117bdf7f5f81fba9d679c35dfa Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 15:19:46 -0400 Subject: nfsd4: split preprocess_seqid, cleanup Move most of this into helper functions. Also move the non-CONFIRM case into caller, providing a helper function for that purpose. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 87 +++++++++++++++++++++++++++-------------------------- fs/nfsd/state.h | 1 - 2 files changed, 45 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index aa088bc3b169..ad20bbf0a1f8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3253,7 +3253,6 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) if (!stp) goto out; status = nfserr_bad_stateid; - if (stp->st_stateowner->so_is_open_owner && !openowner(stp->st_stateowner)->oo_confirmed) goto out; @@ -3418,6 +3417,29 @@ setlkflg (int type) RD_STATE : WR_STATE; } +static __be32 nfs4_nospecial_stateid_checks(stateid_t *stateid) +{ + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + return nfs_ok; +} + +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + if (nfs4_check_fh(current_fh, stp)) + return nfserr_bad_stateid; + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + return check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); +} + /* * Checks for sequence id mutating operations. */ @@ -3426,54 +3448,36 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateid **stpp) { - struct nfs4_stateowner *sop; - struct svc_fh *current_fh = &cstate->current_fh; __be32 status; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); - return nfserr_bad_stateid; - } - - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - - /* - * We return BAD_STATEID if filehandle doesn't match stateid, - * the confirmed flag is incorrecly set, or the generation - * number is incorrect. 
- */ + status = nfs4_nospecial_stateid_checks(stateid); + if (status) + return status; *stpp = find_stateid_by_type(stateid, flags); if (*stpp == NULL) return nfserr_expired; + cstate->replay_owner = (*stpp)->st_stateowner; + renew_client((*stpp)->st_stateowner->so_client); - sop = (*stpp)->st_stateowner; - cstate->replay_owner = sop; + return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); +} - if (nfs4_check_fh(current_fh, *stpp)) { - dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); - return nfserr_bad_stateid; - } +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_stateid **stpp) +{ + __be32 status; + struct nfs4_openowner *oo; - status = nfsd4_check_seqid(cstate, sop, seqid); + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + OPEN_STATE, stpp); if (status) return status; - - if (sop->so_is_open_owner && !openowner(sop)->oo_confirmed - && !(flags & CONFIRM)) { - dprintk("NFSD: preprocess_seqid_op: stateowner not" - " confirmed yet!\n"); + oo = openowner((*stpp)->st_stateowner); + if (!oo->oo_confirmed) return nfserr_bad_stateid; - } - status = check_stateid_generation(stateid, &(*stpp)->st_stateid, nfsd4_has_session(cstate)); - if (status) - return status; - renew_client(sop->so_client); return nfs_ok; } @@ -3497,7 +3501,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, &stp); + OPEN_STATE, &stp); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3557,8 +3561,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, - &od->od_stateid, OPEN_STATE, &stp); + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp); if (status) goto out; status = nfserr_inval; @@ -3602,9 +3606,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - OPEN_STATE, &stp); + status = nfs4_preprocess_confirmed_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, &stp); if (stp == NULL && status == nfserr_expired) { /* * Also, we should make sure this isn't just the result of @@ -3963,10 +3966,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, &open_stp); + &open_stp); if (status) goto out; open_sop = openowner(open_stp->st_stateowner); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 7994ed9be3cc..9745cc781e74 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -463,7 +463,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 -- cgit v1.2.3-58-ga151 From 2288d0e3958b94bcc3c00a78ea06909a8eb66378 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 15:50:21 -0400 Subject: nfsd4: pass around typemask instead of flags We're only using those flags to choose lock or open stateid's at this point. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 18 ++++++++---------- fs/nfsd/state.h | 2 -- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ad20bbf0a1f8..48134635fc2f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1080,16 +1080,14 @@ static struct nfs4_stateid *find_stateid(stateid_t *t) return NULL; } -static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, int flags) +static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, char typemask) { struct nfs4_stateid *s; s = find_stateid(t); if (!s) return NULL; - if (flags & LOCK_STATE && s->st_type == NFS4_LOCK_STID) - return s; - if (flags & OPEN_STATE && s->st_type == NFS4_OPEN_STID) + if (typemask & s->st_type) return s; return NULL; } @@ -3445,7 +3443,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, int flags, + stateid_t *stateid, char typemask, struct nfs4_stateid **stpp) { __be32 status; @@ -3457,7 +3455,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, status = nfs4_nospecial_stateid_checks(stateid); if (status) return status; - *stpp = find_stateid_by_type(stateid, flags); + *stpp = find_stateid_by_type(stateid, typemask); if (*stpp == NULL) return nfserr_expired; cstate->replay_owner = (*stpp)->st_stateowner; @@ -3472,7 +3470,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_openowner *oo; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - OPEN_STATE, stpp); + NFS4_OPEN_STID, stpp); if (status) return status; oo = openowner((*stpp)->st_stateowner); @@ -3501,7 +3499,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - OPEN_STATE, &stp); + NFS4_OPEN_STID, &stp); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3999,7 +3997,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, - LOCK_STATE, &lock_stp); + NFS4_LOCK_STID, &lock_stp); if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); @@ -4197,7 +4195,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, LOCK_STATE, &stp); + &locku->lu_stateid, NFS4_LOCK_STID, &stp); if (status) goto out; filp = find_any_file(stp->st_file); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9745cc781e74..ef949eb3a86e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -463,8 +463,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define OPEN_STATE 0x00000004 -#define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -- cgit v1.2.3-58-ga151 From 881ea2b11e8a786a8d30c108261296953380bead Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Sep 2011 17:20:34 -0400 Subject: nfsd4: rename init_stateid Note this is actually open-stateid specific. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 48134635fc2f..73b5e1e264fa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2314,7 +2314,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str } static inline void -init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +init_open_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); @@ -2934,7 +2934,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; - init_stateid(stp, fp, open); + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); -- cgit v1.2.3-58-ga151 From 91a8c04031d051db41ec2a8c15e24622da037382 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 7 Sep 2011 12:54:06 -0400 Subject: nfsd4: remove redundant stateid initialization Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 73b5e1e264fa..768382d1de97 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2318,10 +2318,7 @@ init_open_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_o struct nfs4_openowner *oo = open->op_openowner; unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); - INIT_LIST_HEAD(&stp->st_perfile); list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -3874,10 +3871,6 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ list_add(&stp->st_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); -- cgit v1.2.3-58-ga151 From dcef0413da9a17bfca917d8b49baf309ce76b737 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 7 Sep 2011 16:06:42 -0400 Subject: nfsd4: move some of nfs4_stateid into a separate structure We want delegations to share more with open/lock stateid's, so first we'll pull out some of the common stuff we want to share. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 174 ++++++++++++++++++++++++++-------------------------- fs/nfsd/state.h | 29 ++++++--- 2 files changed, 106 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 768382d1de97..d0bb5a5613a9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -159,7 +159,7 @@ static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_stateid */ +/* hash table for (open)nfs4_ol_stateid */ #define STATEID_HASH_BITS 10 #define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) #define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) @@ -219,7 +219,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; @@ -380,7 +380,7 @@ set_deny(unsigned int *deny, unsigned long bmap) { } static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { +test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { unsigned int access, deny; set_access(&access, stp->st_access_bmap); @@ -403,14 +403,14 @@ static int nfs4_access_to_omode(u32 access) BUG(); } -static void unhash_generic_stateid(struct nfs4_stateid *stp) +static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - list_del(&stp->st_hash); + list_del(&stp->st_stid.sc_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } -static void close_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_ol_stateid *stp) { int i; @@ -426,13 +426,13 @@ static void close_generic_stateid(struct nfs4_stateid *stp) stp->st_file = NULL; } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void free_generic_stateid(struct nfs4_ol_stateid *stp) { close_generic_stateid(stp); kmem_cache_free(stateid_slab, stp); } -static void release_lock_stateid(struct nfs4_stateid *stp) +static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct file *file; @@ -445,14 +445,14 @@ static void release_lock_stateid(struct nfs4_stateid *stp) static void unhash_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; list_del(&lo->lo_owner.so_idhash); list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_stateid, st_perstateowner); + struct nfs4_ol_stateid, st_perstateowner); release_lock_stateid(stp); } } @@ -464,7 +464,7 @@ static void release_lockowner(struct nfs4_lockowner *lo) } static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) +release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) { struct nfs4_lockowner *lo; @@ -475,7 +475,7 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp) } } -static void release_open_stateid(struct nfs4_stateid *stp) +static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); @@ -484,14 +484,14 @@ static void release_open_stateid(struct nfs4_stateid *stp) static void unhash_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; 
list_del(&oo->oo_owner.so_idhash); list_del(&oo->oo_owner.so_strhash); list_del(&oo->oo_perclient); while (!list_empty(&oo->oo_owner.so_stateids)) { stp = list_first_entry(&oo->oo_owner.so_stateids, - struct nfs4_stateid, st_perstateowner); + struct nfs4_ol_stateid, st_perstateowner); release_open_stateid(stp); } } @@ -1068,26 +1068,26 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_stateid *find_stateid(stateid_t *t) +static struct nfs4_ol_stateid *find_stateid(stateid_t *t) { - struct nfs4_stateid *s; + struct nfs4_stid *s; unsigned int hashval; hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); - list_for_each_entry(s, &stateid_hashtbl[hashval], st_hash) - if (same_stateid(&s->st_stateid, t)) - return s; + list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) + if (same_stateid(&s->sc_stateid, t)) + return openlockstateid(s); return NULL; } -static struct nfs4_stateid *find_stateid_by_type(stateid_t *t, char typemask) +static struct nfs4_ol_stateid *find_stateid_by_type(stateid_t *t, char typemask) { - struct nfs4_stateid *s; + struct nfs4_ol_stateid *s; s = find_stateid(t); if (!s) return NULL; - if (typemask & s->st_type) + if (typemask & s->st_stid.sc_type) return s; return NULL; } @@ -2232,7 +2232,7 @@ nfsd4_init_slabs(void) if (file_slab == NULL) goto out_nomem; stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_stateid), 0, 0, NULL); + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) goto out_nomem; deleg_slab = kmem_cache_create("nfsd4_delegations", @@ -2314,23 +2314,23 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str } static inline void -init_open_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_lockowners); - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = NFS4_OPEN_STID; stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = oo->oo_owner.so_id; - stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_boot = boot_time; + stp->st_stid.sc_stateid.si_stateownerid = oo->oo_owner.so_id; + stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ - stp->st_stateid.si_generation = 0; + stp->st_stid.sc_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, @@ -2422,7 +2422,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; dprintk("NFSD: nfs4_share_conflict\n"); @@ -2611,9 +2611,9 @@ out: } static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *local; + 
struct nfs4_ol_stateid *local; struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { @@ -2630,7 +2630,7 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_state return nfs_ok; } -static inline struct nfs4_stateid * +static inline struct nfs4_ol_stateid * nfs4_alloc_stateid(void) { return kmem_cache_alloc(stateid_slab, GFP_KERNEL); @@ -2672,11 +2672,11 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, } static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, +nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_ol_stateid **stpp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfsd4_open *open) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 status; stp = nfs4_alloc_stateid(); @@ -2708,7 +2708,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; bool new_access; @@ -2820,7 +2820,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); @@ -2888,7 +2888,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -2938,8 +2938,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } } - update_stateid(&stp->st_stateid); - memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) open->op_openowner->oo_confirmed = 1; @@ -2953,7 +2953,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; dprintk("%s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(&stp->st_stateid)); + STATEID_VAL(&stp->st_stid.sc_stateid)); out: if (fp) put_nfs4_file(fp); @@ -3122,7 +3122,7 @@ static struct nfs4_openowner * search_close_lru(u32 st_id) } static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) +nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; } @@ -3153,7 +3153,7 @@ access_permit_write(unsigned long access_bmap) } static -__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; @@ -3237,7 +3237,7 @@ static int is_delegation_stateid(stateid_t *stateid) __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = 
NULL; __be32 status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) @@ -3252,7 +3252,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) && !openowner(stp->st_stateowner)->oo_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, has_session); + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, has_session); if (status) goto out; @@ -3268,7 +3268,7 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; @@ -3317,7 +3317,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (stp->st_stateowner->so_is_open_owner && !openowner(stp->st_stateowner)->oo_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3347,7 +3347,7 @@ nfsd4_free_delegation_stateid(stateid_t *stateid) } static __be32 -nfsd4_free_lock_stateid(struct nfs4_stateid *stp) +nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) return nfserr_locks_held; @@ -3374,7 +3374,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) { stateid_t *stateid = &free_stateid->fr_stateid; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; nfs4_lock_state(); @@ -3388,11 +3388,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_bad_stateid; goto out; } - ret = check_stateid_generation(stateid, &stp->st_stateid, 1); + ret = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, 1); if (ret) goto out; - if (stp->st_type == NFS4_OPEN_STID) { + if (stp->st_stid.sc_type == NFS4_OPEN_STID) { ret = nfserr_locks_held; goto out; } else { @@ -3421,7 +3421,7 @@ static __be32 nfs4_nospecial_stateid_checks(stateid_t *stateid) return nfs_ok; } -static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_stateid *stp) +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) { struct svc_fh *current_fh = &cstate->current_fh; struct nfs4_stateowner *sop = stp->st_stateowner; @@ -3432,7 +3432,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - return check_stateid_generation(stateid, &stp->st_stateid, nfsd4_has_session(cstate)); + return check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); } /* @@ -3441,7 +3441,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, char typemask, - struct nfs4_stateid **stpp) + struct nfs4_ol_stateid **stpp) { __be32 status; @@ -3461,7 +3461,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } -static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct 
nfs4_stateid **stpp) +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) { __be32 status; struct nfs4_openowner *oo; @@ -3482,7 +3482,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfs4_openowner *oo; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3504,10 +3504,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (oo->oo_confirmed) goto out; oo->oo_confirmed = 1; - update_stateid(&stp->st_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", - __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); nfsd4_create_clid_dir(oo->oo_owner.so_client); status = nfs_ok; @@ -3517,7 +3517,7 @@ out: return status; } -static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) +static inline void nfs4_file_downgrade(struct nfs4_ol_stateid *stp, unsigned int to_access) { int i; @@ -3545,7 +3545,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_open_downgrade *od) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3575,8 +3575,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); - update_stateid(&stp->st_stateid); - memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: if (!cstate->replay_owner) @@ -3593,7 +3593,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfs4_openowner *oo; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3622,8 +3622,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; oo = openowner(stp->st_stateowner); status = nfs_ok; - update_stateid(&stp->st_stateid); - memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); /* release_stateid() calls nfsd_close() if needed */ release_open_stateid(stp); @@ -3828,7 +3828,7 @@ find_lockowner_str(struct inode *inode, clientid_t *clid, return NULL; } -static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp) +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { unsigned int idhashval; @@ -3847,7 +3847,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s */ static struct nfs4_lockowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client 
*clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { struct nfs4_lockowner *lo; lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); @@ -3862,27 +3862,27 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return lo; } -static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +static struct nfs4_ol_stateid * +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; unsigned int hashval = stateid_hashval(lo->lo_owner.so_id, fp->fi_id); stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; - stp->st_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = lo->lo_owner.so_id; - stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_boot = boot_time; + stp->st_stid.sc_stateid.si_stateownerid = lo->lo_owner.so_id; + stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ - stp->st_stateid.si_generation = 0; + stp->st_stid.sc_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; @@ -3898,7 +3898,7 @@ check_lock_length(u64 offset, u64 length) LOFF_OVERFLOW(offset, length))); } -static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_file; int oflag = nfs4_access_to_omode(access); @@ -3918,7 +3918,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_openowner *open_sop = NULL; struct nfs4_lockowner *lock_sop = NULL; - struct nfs4_stateid *lock_stp; + struct nfs4_ol_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; struct file_lock file_lock; @@ -3949,7 +3949,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Use open owner and open stateid to create lock owner and * lock stateid. */ - struct nfs4_stateid *open_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -4052,8 +4052,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + update_stateid(&lock_stp->st_stid.sc_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, sizeof(stateid_t)); status = 0; break; @@ -4172,7 +4172,7 @@ __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct file *filp = NULL; struct file_lock file_lock; __be32 status; @@ -4220,8 +4220,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* * OK, unlock succeeded; the only thing left to do is update the stateid. 
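
update_stateid() is assumed here to bump si_generation, so a client that later presents a stale copy fails the generation check (nfserr_old_stateid or nfserr_bad_stateid in the kernel; the NFSv4.1 seqid==0 "match anything" case is omitted). A compact userspace sketch of that discipline:

#include <stdio.h>

typedef struct { unsigned int si_generation; } stateid_t;

static void update_stateid(stateid_t *s)
{
	s->si_generation++;	/* client must present the new value */
}

/* 0 = ok, -1 = old generation, -2 = generation from the future */
static int check_stateid_generation(const stateid_t *in, const stateid_t *cur)
{
	if (in->si_generation == cur->si_generation)
		return 0;
	return in->si_generation < cur->si_generation ? -1 : -2;
}

int main(void)
{
	stateid_t server = { .si_generation = 1 };
	stateid_t client = server;	/* what the client last saw */

	update_stateid(&server);	/* e.g. after this successful unlock */
	printf("stale check: %d\n", check_stateid_generation(&client, &server));
	return 0;
}
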
*/ - update_stateid(&stp->st_stateid); - memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: nfs4_unlock_state(); @@ -4264,7 +4264,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; struct nfs4_lockowner *lo; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; int i; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ef949eb3a86e..d7fffabb8d56 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -328,10 +328,10 @@ struct nfs4_replay { * for lock_owner * so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client * struct is reaped. -* so_perfilestate: heads the list of nfs4_stateid (either open or lock) -* and is used to ensure no dangling nfs4_stateid references when we +* so_perfilestate: heads the list of nfs4_ol_stateid (either open or lock) +* and is used to ensure no dangling nfs4_ol_stateid references when we * release a stateowner. -* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* so_perlockowner: (open) nfs4_ol_stateid->st_perlockowner entry - used when * close is called to reap associated byte-range locks * so_close_lru: (open) stateowner is placed on this list instead of being * reaped (when so_perfilestate is empty) to hold the last close replay. @@ -430,9 +430,9 @@ static inline struct file *find_any_file(struct nfs4_file *f) } /* -* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* nfs4_ol_stateid can either be an open stateid or (eventually) a lock stateid * -* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* (open)nfs4_ol_stateid: one per (open)nfs4_stateowner, nfs4_file * * st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry * st_perfile: file_hashtbl[] entry. @@ -446,22 +446,31 @@ static inline struct file *find_any_file(struct nfs4_file *f) * we should consider defining separate structs for the two cases. */ -struct nfs4_stateid { +struct nfs4_stid { #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 - char st_type; - struct list_head st_hash; + char sc_type; + struct list_head sc_hash; + stateid_t sc_stateid; +}; + +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; - stateid_t st_stateid; unsigned long st_access_bmap; unsigned long st_deny_bmap; - struct nfs4_stateid * st_openstp; + struct nfs4_ol_stateid * st_openstp; }; +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + /* flags for preprocess_seqid_op() */ #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -- cgit v1.2.3-58-ga151 From d5477a8db8134c481ad7b4b745f6defa119253e1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 8 Sep 2011 12:07:44 -0400 Subject: nfsd4: add common dl_stid field to delegation Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 2 +- fs/nfsd/nfs4state.c | 20 ++++++++++---------- fs/nfsd/state.h | 20 +++++++++++--------- 3 files changed, 22 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 02eb4edf0ece..93b5e405ad38 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -351,7 +351,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, __be32 *p; encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); - encode_stateid4(xdr, &dp->dl_stateid); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d0bb5a5613a9..3e3d605d3e73 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -247,10 +247,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stateid.si_boot = boot_time; - dp->dl_stateid.si_stateownerid = current_delegid++; - dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 1; + dp->dl_stid.sc_stateid.si_boot = boot_time; + dp->dl_stid.sc_stateid.si_stateownerid = current_delegid++; + dp->dl_stid.sc_stateid.si_fileid = 0; + dp->dl_stid.sc_stateid.si_generation = 1; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -2572,7 +2572,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) spin_lock(&recall_lock); list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + if (dp->dl_stid.sc_stateid.si_stateownerid == stid->si_stateownerid) { spin_unlock(&recall_lock); return dp; } @@ -2861,10 +2861,10 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ if (status) goto out_free; - memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", - STATEID_VAL(&dp->dl_stateid)); + STATEID_VAL(&dp->dl_stid.sc_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -3296,7 +3296,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(ino, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; status = nfs4_check_delegmode(dp, flags); @@ -3667,7 +3667,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; renew_client(dp->dl_client); @@ -3737,7 +3737,7 @@ search_for_delegation(stateid_t *stid) list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { list_for_each(pos, &fp->fi_delegations) { dp = list_entry(pos, struct nfs4_delegation, dl_perfile); - if (same_stateid(&dp->dl_stateid, stid)) + if (same_stateid(&dp->dl_stid.sc_stateid, stid)) return dp; } } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d7fffabb8d56..e3ff7c9f9264 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -76,6 +76,15 @@ struct 
nfsd4_callback { bool cb_done; }; +struct nfs4_stid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 + char sc_type; + struct list_head sc_hash; + stateid_t sc_stateid; +}; + struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; @@ -86,7 +95,7 @@ struct nfs4_delegation { u32 dl_type; time_t dl_time; /* For recall: */ - stateid_t dl_stateid; + struct nfs4_stid dl_stid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -446,14 +455,7 @@ static inline struct file *find_any_file(struct nfs4_file *f) * we should consider defining separate structs for the two cases. */ -struct nfs4_stid { -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 - char sc_type; - struct list_head sc_hash; - stateid_t sc_stateid; -}; - +/* "ol" stands for "Open or Lock". Better suggestions welcome. */ struct nfs4_ol_stateid { struct nfs4_stid st_stid; struct list_head st_perfile; -- cgit v1.2.3-58-ga151 From 36d44c6038f6d3f8786187a5558c2f3b39a7af95 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 8 Sep 2011 12:16:03 -0400 Subject: nfsd4: share common stid-hashing helper function Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e3d605d3e73..581a5d01b005 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -218,6 +218,15 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } +static inline void hash_stid(struct nfs4_stid *stid) +{ + stateid_t *s = &stid->sc_stateid; + unsigned int hashval; + + hashval = stateid_hashval(s->si_stateownerid, s->si_fileid); + list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -2316,10 +2325,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str static inline void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; - unsigned int hashval = stateid_hashval(oo->oo_owner.so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_lockowners); - list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); stp->st_stid.sc_type = NFS4_OPEN_STID; @@ -2331,6 +2338,7 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; + hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = 0; __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, @@ -3866,12 +3874,10 @@ static struct nfs4_ol_stateid * alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { struct nfs4_ol_stateid *stp; - unsigned int hashval = stateid_hashval(lo->lo_owner.so_id, fp->fi_id); stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; - list_add(&stp->st_stid.sc_hash, &stateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; @@ -3883,6 +3889,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file 
*fp, struct stp->st_stid.sc_stateid.si_fileid = fp->fi_id; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; + hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; -- cgit v1.2.3-58-ga151 From f459e4535904e16ca9f0cc202c78345c332bbbad Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Sep 2011 09:06:12 -0400 Subject: nfsd4: hash deleg stateid's like any other It's simpler to look up delegation stateid's in the same hash table as any other stateid. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 112 +++++++++++++++++++++------------------------------- fs/nfsd/state.h | 5 +++ 2 files changed, 51 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 581a5d01b005..24685a02690f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -60,8 +60,6 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_delegation * search_for_delegation(stateid_t *stid); -static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -256,10 +254,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; + dp->dl_stid.sc_type = NFS4_DELEG_STID; dp->dl_stid.sc_stateid.si_boot = boot_time; dp->dl_stid.sc_stateid.si_stateownerid = current_delegid++; dp->dl_stid.sc_stateid.si_fileid = 0; dp->dl_stid.sc_stateid.si_generation = 1; + hash_stid(&dp->dl_stid); fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -292,6 +292,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) static void unhash_delegation(struct nfs4_delegation *dp) { + list_del_init(&dp->dl_stid.sc_hash); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -1077,7 +1078,7 @@ same_stateid(stateid_t *id_one, stateid_t *id_two) return id_one->si_fileid == id_two->si_fileid; } -static struct nfs4_ol_stateid *find_stateid(stateid_t *t) +static struct nfs4_stid *find_stateid(stateid_t *t) { struct nfs4_stid *s; unsigned int hashval; @@ -1085,22 +1086,42 @@ static struct nfs4_ol_stateid *find_stateid(stateid_t *t) hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) if (same_stateid(&s->sc_stateid, t)) - return openlockstateid(s); + return s; return NULL; } -static struct nfs4_ol_stateid *find_stateid_by_type(stateid_t *t, char typemask) +static struct nfs4_ol_stateid *find_ol_stateid(stateid_t *t) { - struct nfs4_ol_stateid *s; + struct nfs4_stid *s; + + s = find_stateid(t); + if (!s) + return NULL; + return openlockstateid(s); +} + +static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) +{ + struct nfs4_stid *s; s = find_stateid(t); if (!s) return NULL; - if (typemask & s->st_stid.sc_type) + if (typemask & s->sc_type) return s; return NULL; } +static struct nfs4_ol_stateid *find_ol_stateid_by_type(stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + s = find_stateid_by_type(t, typemask); + if (!s) + return NULL; + return openlockstateid(s); +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, 
nfs4_verifier *verf) { @@ -2573,26 +2594,21 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) return nfs_ok; } -static struct nfs4_delegation * -find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +static int share_access_to_flags(u32 share_access) { - struct nfs4_delegation *dp; + share_access &= ~NFS4_SHARE_WANT_MASK; - spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stid.sc_stateid.si_stateownerid == stid->si_stateownerid) { - spin_unlock(&recall_lock); - return dp; - } - spin_unlock(&recall_lock); - return NULL; + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static int share_access_to_flags(u32 share_access) +static struct nfs4_delegation *find_deleg_stateid(stateid_t *s) { - share_access &= ~NFS4_SHARE_WANT_MASK; + struct nfs4_stid *ret; - return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; + ret = find_stateid_by_type(s, NFS4_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); } static __be32 @@ -2602,7 +2618,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, int flags; __be32 status = nfserr_bad_stateid; - *dp = find_delegation_file(fp, &open->op_delegate_stateid); + *dp = find_deleg_stateid(&open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -3252,7 +3268,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) goto out; status = nfserr_expired; - stp = find_stateid(stateid); + stp = find_ol_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3301,7 +3317,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, */ status = nfserr_expired; if (is_delegation_stateid(stateid)) { - dp = find_delegation_stateid(ino, stateid); + dp = find_deleg_stateid(stateid); if (!dp) goto out; status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); @@ -3316,7 +3332,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, BUG_ON(!*filpp); } } else { /* open or lock stateid */ - stp = find_stateid(stateid); + stp = find_ol_stateid(stateid); if (!stp) goto out; status = nfserr_bad_stateid; @@ -3348,9 +3364,10 @@ out: static __be32 nfsd4_free_delegation_stateid(stateid_t *stateid) { - struct nfs4_delegation *dp = search_for_delegation(stateid); + struct nfs4_delegation *dp = find_deleg_stateid(stateid); if (dp) return nfserr_locks_held; + return nfserr_bad_stateid; } @@ -3391,7 +3408,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - stp = find_stateid(stateid); + stp = find_ol_stateid(stateid); if (!stp) { ret = nfserr_bad_stateid; goto out; @@ -3460,7 +3477,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, status = nfs4_nospecial_stateid_checks(stateid); if (status) return status; - *stpp = find_stateid_by_type(stateid, typemask); + *stpp = find_ol_stateid_by_type(stateid, typemask); if (*stpp == NULL) return nfserr_expired; cstate->replay_owner = (*stpp)->st_stateowner; @@ -3672,7 +3689,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!is_delegation_stateid(stateid)) goto out; status = nfserr_expired; - dp = find_delegation_stateid(inode, stateid); + dp = find_deleg_stateid(stateid); if (!dp) goto out; status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); @@ -3733,43 +3750,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, static struct 
list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct nfs4_delegation * -search_for_delegation(stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dp; - struct list_head *pos; - int i; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { - list_for_each(pos, &fp->fi_delegations) { - dp = list_entry(pos, struct nfs4_delegation, dl_perfile); - if (same_stateid(&dp->dl_stid.sc_stateid, stid)) - return dp; - } - } - } - return NULL; -} - -static struct nfs4_delegation * -find_delegation_stateid(struct inode *ino, stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dl; - - dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(stid)); - - fp = find_file(ino); - if (!fp) - return NULL; - dl = find_delegation_file(fp, stid); - put_nfs4_file(fp); - return dl; -} - /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e3ff7c9f9264..12c142436705 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -113,6 +113,11 @@ struct nfs4_cb_conn { struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ -- cgit v1.2.3-58-ga151 From 97b7e3b6d4ae838694b43f66b4f0b32b5ec7635b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Sep 2011 11:26:58 -0400 Subject: nfsd4: fix test_stateid for delegation stateid's Test_stateid should handle delegation stateid's as well. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 24685a02690f..30387f3d9881 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3261,28 +3261,26 @@ static int is_delegation_stateid(stateid_t *stateid) __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { - struct nfs4_ol_stateid *stp = NULL; - __be32 status = nfserr_stale_stateid; + struct nfs4_stid *s; + struct nfs4_ol_stateid *ols; + __be32 status; if (STALE_STATEID(stateid)) - goto out; - - status = nfserr_expired; - stp = find_ol_stateid(stateid); - if (!stp) - goto out; - status = nfserr_bad_stateid; - if (stp->st_stateowner->so_is_open_owner - && !openowner(stp->st_stateowner)->oo_confirmed) - goto out; + return nfserr_stale_stateid; - status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, has_session); + s = find_stateid(stateid); + if (!s) + return nfserr_stale_stateid; + status = check_stateid_generation(stateid, &s->sc_stateid, has_session); if (status) - goto out; - - status = nfs_ok; -out: - return status; + return status; + if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + return nfs_ok; + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !openowner(ols->st_stateowner)->oo_confirmed) + return nfserr_bad_stateid; + return nfs_ok; } /* -- cgit v1.2.3-58-ga151 From 69064a2764fe195f1478be3ea83d15abe5d71025 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 9 Sep 2011 11:54:57 -0400 Subject: nfsd4: use deleg changes to cleanup preprocess_stateid_op Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 30387f3d9881..ea338d0c62a1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3290,6 +3290,7 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { + struct nfs4_stid *s; struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; @@ -3314,13 +3315,14 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, * but that we can't find, is expired: */ status = nfserr_expired; - if (is_delegation_stateid(stateid)) { - dp = find_deleg_stateid(stateid); - if (!dp) - goto out; - status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); - if (status) - goto out; + s = find_stateid(stateid); + if (!s) + goto out; + status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + if (status) + goto out; + if (s->sc_type == NFS4_DELEG_STID) { + dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) goto out; @@ -3330,19 +3332,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, BUG_ON(!*filpp); } } else { /* open or lock stateid */ - stp = find_ol_stateid(stateid); - if (!stp) - goto out; + stp = openlockstateid(s); status = nfserr_bad_stateid; if (nfs4_check_fh(current_fh, stp)) goto out; if (stp->st_stateowner->so_is_open_owner && !openowner(stp->st_stateowner)->oo_confirmed) goto out; - status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, - nfsd4_has_session(cstate)); - if (status) - goto out; status = nfs4_check_openmode(stp, flags); if (status) goto out; -- cgit v1.2.3-58-ga151 From ee626a77d3725a129391b1c85edd46f3b470cca9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 11 Sep 2011 13:48:41 -0400 Subject: nfsd4: better stateid hashing First, we shouldn't care here about the structure of the opaque part of the stateid. Second, this hash is really dumb. (I'm not sure the replacement is much better, though--to look at it another patch.) Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ea338d0c62a1..0cd346477f29 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -168,9 +168,9 @@ static unsigned int file_hashval(struct inode *ino) return hash_ptr(ino, FILE_HASH_BITS); } -static unsigned int stateid_hashval(u32 owner_id, u32 file_id) +static unsigned int stateid_hashval(stateid_t *s) { - return (owner_id + file_id) & STATEID_HASH_MASK; + return opaque_hashval(&s->si_opaque, sizeof(stateid_opaque_t)) & STATEID_HASH_MASK; } static struct list_head file_hashtbl[FILE_HASH_SIZE]; @@ -221,7 +221,7 @@ static inline void hash_stid(struct nfs4_stid *stid) stateid_t *s = &stid->sc_stateid; unsigned int hashval; - hashval = stateid_hashval(s->si_stateownerid, s->si_fileid); + hashval = stateid_hashval(s); list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); } @@ -1083,7 +1083,7 @@ static struct nfs4_stid *find_stateid(stateid_t *t) struct nfs4_stid *s; unsigned int hashval; - hashval = stateid_hashval(t->si_stateownerid, t->si_fileid); + hashval = stateid_hashval(t); list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) if (same_stateid(&s->sc_stateid, t)) return s; -- cgit v1.2.3-58-ga151 From ed748aacb8e3318fa2cf24e1c197d35b5fd29605 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Sep 2011 19:37:06 -0400 Subject: NFSD: Cleanup for nfsd4_path() The current code is sort of hackish in that it assumes a referral is always matched to an export. When we add support for junctions that may not be the case. We can replace nfsd4_path() with a function that encodes the components directly from the dentries. Since nfsd4_path is currently the only user of the 'ex_pathname' field in struct svc_export, this has the added benefit of allowing us to get rid of that. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 4 +- fs/nfsd/nfs4xdr.c | 106 ++++++++++++++++++++++++++++++++------------ include/linux/nfsd/export.h | 1 + 3 files changed, 81 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index d491421cd708..99229b0c153e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1009,7 +1009,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } -static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; @@ -1029,7 +1029,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) struct svc_export *exp; __be32 rv; - exp = find_fsidzero_export(rqstp); + exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 182570bed472..5252d6681960 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1696,36 +1696,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, } /* - * Return the path to an export point in the pseudo filesystem namespace - * Returned string is safe to use as long as the caller holds a reference - * to @exp. 
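
The replacement below drops the string-prefix comparison entirely: it walks from the target dentry up to the pseudoroot collecting components, then emits a count followed by the components root-first, as RFC 3530's pathname4 expects. A userspace sketch of that shape, with plain strings standing in for dentries (hypothetical helper, illustrative only):

#include <stdio.h>
#include <string.h>

static void encode_components(const char *root, const char *path)
{
	const char *components[16];
	unsigned int n = 0;
	char buf[256];
	char *p;

	snprintf(buf, sizeof(buf), "%s", path);
	/* walk "up": strip one trailing component per iteration */
	while (n < 16 && strcmp(buf, root) != 0 &&
	       (p = strrchr(buf, '/')) != NULL) {
		components[n++] = p + 1;	/* substring stays valid in buf */
		*p = '\0';
	}

	/* pathname4 layout: component count, then components root-first */
	printf("%u:", n);
	while (n)
		printf(" %s", components[--n]);
	printf("\n");
}

int main(void)
{
	encode_components("/export", "/export/a/b/c");	/* prints "3: a b c" */
	return 0;
}
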
+ * Encode a path in RFC3530 'pathname4' format */ -static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +static __be32 nfsd4_encode_path(const struct path *root, + const struct path *path, __be32 **pp, int *buflen) { - struct svc_fh tmp_fh; - char *path = NULL, *rootpath; - size_t rootlen; + struct path cur = { + .mnt = path->mnt, + .dentry = path->dentry, + }; + __be32 *p = *pp; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; - fh_init(&tmp_fh, NFS4_FHSIZE); - *stat = exp_pseudoroot(rqstp, &tmp_fh); - if (*stat) - return NULL; - rootpath = tmp_fh.fh_export->ex_pathname; + dprintk("nfsd4_encode_components("); - path = exp->ex_pathname; + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (cur.dentry == root->dentry && cur.mnt == root->mnt) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } - rootlen = strlen(rootpath); - if (strncmp(path, rootpath, rootlen)) { - dprintk("nfsd: fs_locations failed;" - "%s is not contained in %s\n", path, rootpath); - *stat = nfserr_notsupp; - path = NULL; - goto out; + *buflen -= 4; + if (*buflen < 0) + goto out_free; + WRITE32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len = dentry->d_name.len; + + *buflen -= 4 + (XDR_QUADLEN(len) << 2); + if (*buflen < 0) + goto out_free; + WRITE32(len); + WRITEMEM(dentry->d_name.name, len); + dprintk("/%s", dentry->d_name.name); + dput(dentry); + ncomponents--; } - path += rootlen; -out: - fh_put(&tmp_fh); - return path; + + *pp = p; + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, + const struct path *path, __be32 **pp, int *buflen) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + exp_put(exp_ps); + return res; } /* @@ -1739,11 +1792,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, int i; __be32 *p = *pp; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - char *root = nfsd4_path(rqstp, exp, &status); - if (status) - return status; - status = nfsd4_encode_components('/', root, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); if (status) return status; if ((*buflen -= 4) < 0) diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 8a31a20efe7e..7ba3fd43f312 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -137,6 +137,7 @@ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -- cgit v1.2.3-58-ga151 From 
2f1ddda1749a223d1a05e16dc6ea28632b9ec570 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Sep 2011 19:37:16 -0400 Subject: NFSD: Remove the ex_pathname field from struct svc_export There are no more users... Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 11 ----------- include/linux/nfsd/export.h | 1 - 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 99229b0c153e..62f3b9074e84 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -317,7 +317,6 @@ static void svc_export_put(struct kref *ref) struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); - kfree(exp->ex_pathname); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp); } @@ -527,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; - err = -ENOMEM; - exp.ex_pathname = kstrdup(buf, GFP_KERNEL); - if (!exp.ex_pathname) - goto out2; - /* expiry */ err = -EINVAL; exp.h.expiry_time = get_expiry(&mesg); @@ -612,8 +606,6 @@ out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: - kfree(exp.ex_pathname); -out2: path_put(&exp.ex_path); out1: auth_domain_put(dom); @@ -677,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_client = item->ex_client; new->ex_path.dentry = dget(item->ex_path.dentry); new->ex_path.mnt = mntget(item->ex_path.mnt); - new->ex_pathname = NULL; new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -695,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_fsid = item->ex_fsid; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; - new->ex_pathname = item->ex_pathname; - item->ex_pathname = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 7ba3fd43f312..f85308e688fd 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -96,7 +96,6 @@ struct svc_export { struct auth_domain * ex_client; int ex_flags; struct path ex_path; - char *ex_pathname; uid_t ex_anon_uid; gid_t ex_anon_gid; int ex_fsid; -- cgit v1.2.3-58-ga151 From 11fcee0293a6d9f0973e04f8b3fb6cd15a55bcce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Sep 2011 19:37:26 -0400 Subject: NFSD: Add a cache for fs_locations information Signed-off-by: Trond Myklebust [ cel: since this is server-side, use nfsd4_ prefix instead of nfs4_ prefix. ] [ cel: implement S_ISVTX filter in bfields-normal form ] Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsd.h | 7 +++++++ fs/nfsd/vfs.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8da03e16ab35..58134a23fdfb 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -361,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ NFSD_WRITEABLE_ATTRS_WORD2 +extern int nfsd4_is_junction(struct dentry *dentry); +#else +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 75c35fa46155..4dd91283d039 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { if (d_mountpoint(dentry)) return 1; + if (nfsd4_is_junction(dentry)) + return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; return dentry->d_inode != NULL; @@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } +#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." +#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + return 0; + return 1; +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 -- cgit v1.2.3-58-ga151 From 849a1cf13d4394d398d91752166e92e9ecd64f8d Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Tue, 30 Aug 2011 17:18:41 +0800 Subject: SUNRPC: Replace svc_addr_u by sockaddr_storage For IPv6 local address, lockd can not callback to client for missing scope id when binding address at inet6_bind: 324 if (addr_type & IPV6_ADDR_LINKLOCAL) { 325 if (addr_len >= sizeof(struct sockaddr_in6) && 326 addr->sin6_scope_id) { 327 /* Override any existing binding, if another one 328 * is supplied by user. 329 */ 330 sk->sk_bound_dev_if = addr->sin6_scope_id; 331 } 332 333 /* Binding to link-local address requires an interface */ 334 if (!sk->sk_bound_dev_if) { 335 err = -EINVAL; 336 goto out_unlock; 337 } Replacing svc_addr_u by sockaddr_storage, let rqstp->rq_daddr contains more info besides address. Reviewed-by: Jeff Layton Reviewed-by: Chuck Lever Signed-off-by: Mi Jinlong Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 25 ++----------------------- fs/nfsd/nfs4state.c | 16 +--------------- include/linux/sunrpc/svc.h | 30 +++++++++++++++++++++--------- net/sunrpc/svc_xprt.c | 13 ++----------- net/sunrpc/svcsock.c | 23 +++++++++++++++++------ 5 files changed, 43 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b7c99bfb3da6..6f29836ec0cb 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - }; - struct sockaddr *src_sap; - size_t src_len = rqstp->rq_addrlen; + struct sockaddr *src_sap = svc_daddr(rqstp); + size_t src_len = rqstp->rq_daddrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - switch (ni.sap->sa_family) { - case AF_INET: - sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - src_sap = (struct sockaddr *)&sin; - break; - case AF_INET6: - ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - src_sap = (struct sockaddr *)&sin6; - break; - default: - dprintk("lockd: %s failed; unrecognized address family\n", - __func__); - goto out; - } - if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0cd346477f29..e7f83bd9b4a8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1257,20 +1257,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) return NULL; } -static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) -{ - switch (family) { - case AF_INET: - ((struct sockaddr_in *)sa)->sin_family = AF_INET; - ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; - return; - case AF_INET6: - ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; - ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; - return; - } -} - static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { @@ -1302,7 +1288,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; - rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a78a51e93373..d8d5d93071b3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -212,11 +212,6 @@ static inline void svc_putu32(struct kvec *iov, __be32 val) iov->iov_len += sizeof(__be32); } -union svc_addr_u { - struct in_addr addr; - struct in6_addr addr6; -}; - /* * The context of a single thread, including the request currently being * processed. 
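
The bind() failure quoted in the changelog comes down to losing sin6_scope_id: union svc_addr_u stored only the bare address bytes, whereas copying the whole sockaddr keeps family and scope id intact. A minimal userspace illustration (standard BSD socket types; not kernel code):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	struct sockaddr_in6 local = {
		.sin6_family	= AF_INET6,
		.sin6_scope_id	= 2,	/* interface index; required for fe80::/10 */
	};
	struct sockaddr_storage full;
	struct in6_addr addr_only;

	/* old scheme: only the 16 address bytes survive the copy */
	memcpy(&addr_only, &local.sin6_addr, sizeof(addr_only));

	/* new scheme: family, port and scope id survive too */
	memcpy(&full, &local, sizeof(local));

	printf("scope id after full copy: %u\n",
	       ((struct sockaddr_in6 *)&full)->sin6_scope_id);
	return 0;
}
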
@@ -225,8 +220,12 @@ struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ struct svc_xprt * rq_xprt; /* transport ptr */ + struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; + struct sockaddr_storage rq_daddr; /* dest addr of request + * - reply from here */ + size_t rq_daddrlen; struct svc_serv * rq_server; /* RPC service definition */ struct svc_pool * rq_pool; /* thread pool */ @@ -255,9 +254,6 @@ struct svc_rqst { unsigned short rq_secure : 1; /* secure port */ - union svc_addr_u rq_daddr; /* dest addr of request - * - reply from here */ - void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ @@ -300,6 +296,21 @@ static inline struct sockaddr *svc_addr(const struct svc_rqst *rqst) return (struct sockaddr *) &rqst->rq_addr; } +static inline struct sockaddr_in *svc_daddr_in(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in *) &rqst->rq_daddr; +} + +static inline struct sockaddr_in6 *svc_daddr_in6(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in6 *) &rqst->rq_daddr; +} + +static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst) +{ + return (struct sockaddr *) &rqst->rq_daddr; +} + /* * Check buffer bounds after decoding arguments */ @@ -340,7 +351,8 @@ struct svc_deferred_req { struct svc_xprt *xprt; struct sockaddr_storage addr; /* where reply must go */ size_t addrlen; - union svc_addr_u daddr; /* where reply must come from */ + struct sockaddr_storage daddr; /* where reply must come from */ + size_t daddrlen; struct cache_deferred_req handle; size_t xprt_hlen; int argslen; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index bd31208bbb61..d86bb673e1f6 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -254,8 +254,6 @@ EXPORT_SYMBOL_GPL(svc_create_xprt); */ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) { - struct sockaddr *sin; - memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); rqstp->rq_addrlen = xprt->xpt_remotelen; @@ -263,15 +261,8 @@ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) * Destination address in request is needed for binding the * source address in RPC replies/callbacks later. 
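
The svc_daddr*() helpers added above are thin typed views over the same sockaddr_storage; a sketch of that accessor pattern (hypothetical struct, illustrative only):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

struct rqst {
	struct sockaddr_storage rq_daddr;	/* generic storage */
};

static struct sockaddr *daddr(struct rqst *r)
{
	return (struct sockaddr *)&r->rq_daddr;
}

static struct sockaddr_in6 *daddr_in6(struct rqst *r)
{
	return (struct sockaddr_in6 *)&r->rq_daddr;
}

int main(void)
{
	struct rqst r;

	memset(&r, 0, sizeof(r));
	daddr_in6(&r)->sin6_family = AF_INET6;		/* fill the IPv6 view */
	printf("family = %d\n", daddr(&r)->sa_family);	/* read the generic view */
	return 0;
}
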
*/ - sin = (struct sockaddr *)&xprt->xpt_local; - switch (sin->sa_family) { - case AF_INET: - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; - break; - case AF_INET6: - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; - break; - } + memcpy(&rqstp->rq_daddr, &xprt->xpt_local, xprt->xpt_locallen); + rqstp->rq_daddrlen = xprt->xpt_locallen; } EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 767d494de7a2..dfd686eb0b7f 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -143,19 +143,20 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) cmh->cmsg_level = SOL_IP; cmh->cmsg_type = IP_PKTINFO; pki->ipi_ifindex = 0; - pki->ipi_spec_dst.s_addr = rqstp->rq_daddr.addr.s_addr; + pki->ipi_spec_dst.s_addr = + svc_daddr_in(rqstp)->sin_addr.s_addr; cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); } break; case AF_INET6: { struct in6_pktinfo *pki = CMSG_DATA(cmh); + struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp); cmh->cmsg_level = SOL_IPV6; cmh->cmsg_type = IPV6_PKTINFO; - pki->ipi6_ifindex = 0; - ipv6_addr_copy(&pki->ipi6_addr, - &rqstp->rq_daddr.addr6); + pki->ipi6_ifindex = daddr->sin6_scope_id; + ipv6_addr_copy(&pki->ipi6_addr, &daddr->sin6_addr); cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); } break; @@ -498,9 +499,13 @@ static int svc_udp_get_dest_address4(struct svc_rqst *rqstp, struct cmsghdr *cmh) { struct in_pktinfo *pki = CMSG_DATA(cmh); + struct sockaddr_in *daddr = svc_daddr_in(rqstp); + if (cmh->cmsg_type != IP_PKTINFO) return 0; - rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; + + daddr->sin_family = AF_INET; + daddr->sin_addr.s_addr = pki->ipi_spec_dst.s_addr; return 1; } @@ -511,9 +516,14 @@ static int svc_udp_get_dest_address6(struct svc_rqst *rqstp, struct cmsghdr *cmh) { struct in6_pktinfo *pki = CMSG_DATA(cmh); + struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp); + if (cmh->cmsg_type != IPV6_PKTINFO) return 0; - ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr); + + daddr->sin6_family = AF_INET6; + ipv6_addr_copy(&daddr->sin6_addr, &pki->ipi6_addr); + daddr->sin6_scope_id = pki->ipi6_ifindex; return 1; } @@ -614,6 +624,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram_locked(svsk->sk_sk, skb); return 0; } + rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); if (skb_is_nonlinear(skb)) { /* we have to copy */ -- cgit v1.2.3-58-ga151 From 58e7b33a58d0cd07c9294d5161553b204c75662d Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Sun, 28 Aug 2011 18:18:56 +0800 Subject: nfsd41: try to check reply size before operation For checking the size of reply before calling a operation, we need try to get maxsize of the operation's reply. v3: using new method as Bruce said, "we could handle operations in two different ways: - For operations that actually change something (write, rename, open, close, ...), do it the way we're doing it now: be very careful to estimate the size of the response before even processing the operation. - For operations that don't change anything (read, getattr, ...) just go ahead and do the operation. If you realize after the fact that the response is too large, then return the error at that point. So we'd add another flag to op_flags: say, OP_MODIFIES_SOMETHING. And for operations with OP_MODIFIES_SOMETHING set, we'd do the first thing. For operations without it set, we'd do the second." Signed-off-by: Mi Jinlong [bfields@redhat.com: crash, don't attempt to handle, undefined op_rsize_bop] Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4xdr.c | 37 ++++---- fs/nfsd/xdr4.h | 1 + 3 files changed, 248 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 460eeb329d81..752a367e0e3d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -35,6 +35,7 @@ #include #include +#include "idmap.h" #include "cache.h" #include "xdr4.h" #include "vfs.h" @@ -1003,6 +1004,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); + enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ @@ -1010,6 +1013,7 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, + OP_MODIFIES_SOMETHING = 1 << 5, /* op is non-idempotent */ }; struct nfsd4_operation { @@ -1025,6 +1029,8 @@ struct nfsd4_operation { * the v4.0 case). */ bool op_cacheresult; + /* Try to get response size before operation */ + nfsd4op_rsize op_rsize_bop; }; static struct nfsd4_operation nfsd4_ops[]; @@ -1119,6 +1125,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; + u32 plen = 0; __be32 status; resp->xbuf = &rqstp->rq_res; @@ -1197,6 +1204,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } + /* If op is non-idempotent */ + if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + plen = opdesc->op_rsize_bop(rqstp, op); + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + if (opdesc->op_func) op->status = opdesc->op_func(rqstp, cstate, &op->u); else @@ -1247,6 +1263,144 @@ out: return status; } +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return 
(op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = op->u.read.rd_length; + + if (rlen > maxcount) + rlen = maxcount; + + return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 rlen = op->u.readdir.rd_maxcount; + + if (rlen > PAGE_SIZE) + rlen = PAGE_SIZE; + + return (op_encode_hdr_size + op_encode_verifier_maxsz) + * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1254,20 +1408,28 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLOSE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_COMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE", .op_cacheresult = true, + .op_rsize_bop = 
(nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, @@ -1280,12 +1442,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING, .op_name = "OP_LINK", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, @@ -1293,7 +1459,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCKU", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, @@ -1311,42 +1479,54 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTPUBFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTROOTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, @@ -1354,29 +1534,38 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_REMOVE", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { - .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_RENAME", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, - .op_flags = ALLOWED_WITHOUT_FH | 
ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RENEW", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_RESTOREFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_SAVEFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, @@ -1386,19 +1575,25 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", + .op_flags = OP_MODIFIES_SOMETHING, .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_SETCLIENTID", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_SETCLIENTID_CONFIRM", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, @@ -1406,35 +1601,47 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_WRITE", .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = (nfsd4op_func)nfsd4_exchange_id, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, }, [OP_BIND_CONN_TO_SESSION] = { .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, }, [OP_DESTROY_SESSION] = { .op_func = (nfsd4op_func)nfsd4_destroy_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = 
(nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEQUENCE] = { .op_func = (nfsd4op_func)nfsd4_sequence, @@ -1443,13 +1650,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_DESTROY_CLIENTID] = { .op_func = NULL, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_RECLAIM_COMPLETE] = { .op_func = (nfsd4op_func)nfsd4_reclaim_complete, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, @@ -1463,8 +1673,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_FREE_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, }; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5252d6681960..f4116cf16595 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3387,34 +3387,29 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation. + * after encoding the current operation with pad. * - * pad: add on 8 bytes for the next operation's op_code and status so that - * there is room to cache a failure on the next operation. + * pad: if the operation is non-idempotent, pad was calculated by op_rsize_bop(), + * which is specified in nfsd4_operation; otherwise pad is zero. * - * Compare this length to the session se_fmaxresp_cached. + * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. * * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so * will be at least a page and will therefore hold the xdr_buf head.
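The rewritten comment above captures the policy of this commit: an operation flagged OP_MODIFIES_SOMETHING has its worst-case reply size, taken from op_rsize_bop(), checked before it executes, while idempotent operations run first and are size-checked after encoding with pad == 0. The following is a simplified, self-contained sketch of that two-phase check; all names (op_desc, process_op, and so on) are invented for illustration, and this is not the nfsd code itself.

/*
 * Two-phase response-size policy, sketched with invented names.
 */
#include <stddef.h>
#include <stdint.h>

#define OP_MODIFIES_SOMETHING (1u << 5)

struct op;
typedef uint32_t (*rsize_fn)(const struct op *op);

struct op_desc {
        uint32_t flags;
        rsize_fn rsize;         /* worst-case encoded reply size */
};

struct op {
        const struct op_desc *desc;
};

enum { OK = 0, ERR_REP_TOO_BIG = 1 };

static int check_resp_size(size_t used, size_t pad, size_t maxresp)
{
        return (used + pad > maxresp) ? ERR_REP_TOO_BIG : OK;
}

static int process_op(const struct op *op, size_t used, size_t maxresp)
{
        /* Modifying op: refuse up front if the reply might not fit. */
        if (op->desc->flags & OP_MODIFIES_SOMETHING) {
                int err = check_resp_size(used, op->desc->rsize(op), maxresp);
                if (err)
                        return err;     /* nothing has been modified yet */
        }

        /* ... execute the operation and encode its result here ... */

        /* Idempotent ops are cheap to fail late; re-check with pad == 0. */
        return check_resp_size(used, 0, maxresp);
}

The asymmetry exists because a modifying operation that has already run cannot simply be retried by the client, so it must never execute when the server cannot respond.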
*/ -static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) { - int status = 0; struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_session *session = NULL; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0, pad = 8; + u32 length, tlen = 0; if (!nfsd4_has_session(&resp->cstate)) - return status; + return 0; session = resp->cstate.session; - if (session == NULL || slot->sl_cachethis == 0) - return status; - - if (resp->opcnt >= args->opcnt) - pad = 0; /* this is the last operation */ + if (session == NULL) + return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3427,10 +3422,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, length, xb->page_len, tlen, pad); - if (length <= session->se_fchannel.maxresp_cached) - return status; - else + if (length > session->se_fchannel.maxresp_sz) + return nfserr_rep_too_big; + + if (slot->sl_cachethis == 1 && + length > session->se_fchannel.maxresp_cached) return nfserr_rep_too_big_to_cache; + + return 0; } void @@ -3450,8 +3449,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); /* nfsd4_check_drc_limit guarantees enough room for error status */ - if (!op->status && nfsd4_check_drc_limit(resp)) - op->status = nfserr_rep_too_big_to_cache; + if (!op->status) + op->status = nfsd4_check_resp_size(resp, 0); status: /* * Note: We write the status directly, instead of using WRITE32(), diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index f95a72482064..a767b57b8208 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -524,6 +524,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); +int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, -- cgit v1.2.3-58-ga151 From dad1c067eb42ec8bedadd64f681056914547d22e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 Sep 2011 12:24:13 -0400 Subject: nfsd4: replace oo_confirmed by flag bit I want at least one more bit here. So, let's haul out the caps lock key and add a flags field. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 ++-- fs/nfsd/nfs4state.c | 24 ++++++++++++------------ fs/nfsd/state.h | 3 ++- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 752a367e0e3d..aa769dfa756a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -375,7 +375,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * The CURRENT_FH is already set to the file being * opened. 
(1) set open->op_cinfo, (2) set @@ -388,7 +388,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e7f83bd9b4a8..59b70afdc884 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2322,7 +2322,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return NULL; oo->oo_owner.so_is_open_owner = 1; oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_confirmed = 0; + oo->oo_flags = 0; oo->oo_time = 0; INIT_LIST_HEAD(&oo->oo_close_lru); hash_openowner(oo, clp, strhashval); @@ -2549,7 +2549,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_expired; goto renew; } - if (!oo->oo_confirmed) { + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ clp = oo->oo_owner.so_client; release_openowner(oo); @@ -2616,7 +2616,7 @@ out: return nfs_ok; if (status) return status; - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } @@ -2749,7 +2749,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; open->op_openowner->oo_owner.so_client->cl_firststate = 1; } @@ -2853,7 +2853,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !oo->oo_confirmed) + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2952,7 +2952,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_openowner->oo_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * Attempt to hand out a delegation. No error return, because the @@ -2973,7 +2973,7 @@ out: * To finish the open response, we just need to set the rflags. 
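An aside on the conversion being applied in these hunks: trading the dedicated int oo_confirmed for a one-byte flags field costs nothing today and leaves seven spare bits (NFS4_OO_PURGE_CLOSE claims one in the very next commit). A minimal sketch of the idiom, with invented names and no reference to the real kernel structures:

/* Boolean-to-flag-bit conversion, sketched with invented names. */
#define OO_CONFIRMED    1       /* was: int confirmed */

struct openowner {
        unsigned char flags;    /* room for seven more bits */
};

static int oo_is_confirmed(const struct openowner *oo)
{
        return oo->flags & OO_CONFIRMED;
}

static void oo_confirm(struct openowner *oo)
{
        oo->flags |= OO_CONFIRMED;
}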
*/ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_openowner->oo_confirmed && + if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; @@ -3264,7 +3264,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) return nfs_ok; ols = openlockstateid(s); if (ols->st_stateowner->so_is_open_owner - && !openowner(ols->st_stateowner)->oo_confirmed) + && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) return nfserr_bad_stateid; return nfs_ok; } @@ -3323,7 +3323,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (nfs4_check_fh(current_fh, stp)) goto out; if (stp->st_stateowner->so_is_open_owner - && !openowner(stp->st_stateowner)->oo_confirmed) + && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) goto out; status = nfs4_check_openmode(stp, flags); if (status) @@ -3476,7 +3476,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs if (status) return status; oo = openowner((*stpp)->st_stateowner); - if (!oo->oo_confirmed) + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) return nfserr_bad_stateid; return nfs_ok; } @@ -3506,9 +3506,9 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; - if (oo->oo_confirmed) + if (oo->oo_flags & NFS4_OO_CONFIRMED) goto out; - oo->oo_confirmed = 1; + oo->oo_flags |= NFS4_OO_CONFIRMED; update_stateid(&stp->st_stid.sc_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 12c142436705..a8324b868a36 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -371,7 +371,8 @@ struct nfs4_openowner { struct list_head oo_perclient; struct list_head oo_close_lru; /* tail queue */ time_t oo_time; /* time of placement on so_close_lru */ - int oo_confirmed; /* successful OPEN_CONFIRM? */ +#define NFS4_OO_CONFIRMED 1 + unsigned char oo_flags; }; struct nfs4_lockowner { -- cgit v1.2.3-58-ga151 From 38c387b52d8404f8fd29d8c26bebc83a80733657 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Sep 2011 17:42:48 -0400 Subject: nfsd4: match close replays on stateid, not open owner id Keep around an unhashed copy of the final stateid after the last close using an openowner, and when identifying a replay, match against that stateid instead of just against the open owner id. Free it the next time the seqid is bumped or the stateowner is destroyed. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 47 ++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/nfs4xdr.c | 1 + fs/nfsd/state.h | 3 +++ 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 59b70afdc884..a174841b262e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -438,7 +438,6 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { - close_generic_stateid(stp); kmem_cache_free(stateid_slab, stp); } @@ -450,6 +449,7 @@ static void release_lock_stateid(struct nfs4_ol_stateid *stp) file = find_any_file(stp->st_file); if (file) locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); + close_generic_stateid(stp); free_generic_stateid(stp); } @@ -485,10 +485,16 @@ release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) } } -static void release_open_stateid(struct nfs4_ol_stateid *stp) +static void unhash_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); + close_generic_stateid(stp); +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + unhash_open_stateid(stp); free_generic_stateid(stp); } @@ -510,6 +516,8 @@ static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); list_del(&oo->oo_close_lru); + if (oo->oo_last_closed_stid) + free_generic_stateid(oo->oo_last_closed_stid); nfs4_free_openowner(oo); } @@ -2324,6 +2332,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str oo->oo_owner.so_seqid = open->op_seqid; oo->oo_flags = 0; oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&oo->oo_close_lru); hash_openowner(oo, clp, strhashval); return oo; @@ -3120,12 +3129,14 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_openowner * search_close_lru(u32 st_id) +static struct nfs4_openowner * search_close_lru(stateid_t *s) { struct nfs4_openowner *local; + struct nfs4_ol_stateid *os; list_for_each_entry(local, &close_lru, oo_close_lru) { - if (local->oo_owner.so_id == st_id) + os = local->oo_last_closed_stid; + if (same_stateid(&os->st_stid.sc_stateid, s)) return local; } return NULL; @@ -3589,6 +3600,27 @@ out: return status; } +void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *s; + + if (!so->so_is_open_owner) + return; + oo = openowner(so); + s = oo->oo_last_closed_stid; + if (!s) + return; + if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { + /* Release the last_closed_stid on the next seqid bump: */ + oo->oo_flags |= NFS4_OO_PURGE_CLOSE; + return; + } + oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; + free_generic_stateid(oo->oo_last_closed_stid); + oo->oo_last_closed_stid = NULL; +} + /* * nfs4_unlock_state() called after encode */ @@ -3613,7 +3645,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Also, we should make sure this isn't just the result of * a replayed close: */ - oo = search_close_lru(close->cl_stateid.si_stateownerid); + oo = search_close_lru(&close->cl_stateid); /* It's not stale; let's assume it's expired: */ if (oo == NULL) goto out; @@ -3630,8 +3662,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* release_stateid() calls nfsd_close() if needed */ - 
release_open_stateid(stp); + /* unhash_open_stateid() calls nfsd_close() if needed */ + oo->oo_last_closed_stid = stp; + unhash_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f4116cf16595..7bd57c2dbc4d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1636,6 +1636,7 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _ (char *)resp->p - (char *)save; memcpy(stateowner->so_replay.rp_buf, save, stateowner->so_replay.rp_buflen); + nfsd4_purge_closed_stateid(stateowner); } } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a8324b868a36..e807abb116f6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -370,8 +370,10 @@ struct nfs4_openowner { struct nfs4_stateowner oo_owner; /* must be first field */ struct list_head oo_perclient; struct list_head oo_close_lru; /* tail queue */ + struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 +#define NFS4_OO_PURGE_CLOSE 2 unsigned char oo_flags; }; @@ -514,5 +516,6 @@ extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(stateid_t *, bool); +extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ -- cgit v1.2.3-58-ga151 From 2da1cec713bc6d3ec9732e7d48b8bc0453580fd3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Sep 2011 18:56:20 -0400 Subject: nfsd4: simplify free_stateid We no longer need is_deleg_stateid, for example. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 66 ++++++++++++----------------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a174841b262e..fdd03f6ae044 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1098,16 +1098,6 @@ static struct nfs4_stid *find_stateid(stateid_t *t) return NULL; } -static struct nfs4_ol_stateid *find_ol_stateid(stateid_t *t) -{ - struct nfs4_stid *s; - - s = find_stateid(t); - if (!s) - return NULL; - return openlockstateid(s); -} - static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) { struct nfs4_stid *s; @@ -3251,11 +3241,6 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess return nfserr_old_stateid; } -static int is_delegation_stateid(stateid_t *stateid) -{ - return stateid->si_fileid == 0; -} - __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) { struct nfs4_stid *s; @@ -3352,16 +3337,6 @@ out: return status; } -static __be32 -nfsd4_free_delegation_stateid(stateid_t *stateid) -{ - struct nfs4_delegation *dp = find_deleg_stateid(stateid); - if (dp) - return nfserr_locks_held; - - return nfserr_bad_stateid; -} - static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { @@ -3382,40 +3357,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfs_ok; } -/* - * Free a state id - */ __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) { stateid_t *stateid = &free_stateid->fr_stateid; - struct nfs4_ol_stateid *stp; - __be32 ret; + struct nfs4_stid *s; + __be32 ret = nfserr_bad_stateid; nfs4_lock_state(); - if 
(is_delegation_stateid(stateid)) { - ret = nfsd4_free_delegation_stateid(stateid); - goto out; - } - - stp = find_ol_stateid(stateid); - if (!stp) { - ret = nfserr_bad_stateid; - goto out; - } - ret = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, 1); - if (ret) + s = find_stateid(stateid); + if (!s) goto out; - - if (stp->st_stid.sc_type == NFS4_OPEN_STID) { + switch (s->sc_type) { + case NFS4_DELEG_STID: ret = nfserr_locks_held; goto out; - } else { - ret = nfsd4_free_lock_stateid(stp); - goto out; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + if (s->sc_type == NFS4_LOCK_STID) + ret = nfsd4_free_lock_stateid(openlockstateid(s)); + else + ret = nfserr_locks_held; } - out: nfs4_unlock_state(); return ret; @@ -3698,9 +3665,6 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) goto out; - status = nfserr_bad_stateid; - if (!is_delegation_stateid(stateid)) - goto out; status = nfserr_expired; dp = find_deleg_stateid(stateid); if (!dp) -- cgit v1.2.3-58-ga151 From d3b313a463c64c54d57c6af09c4a5d20106c1d1c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Sep 2011 15:02:41 -0400 Subject: nfsd4: construct stateid from clientid and counter Including the full clientid in the on-the-wire stateid allows more reliable detection of bad vs. expired stateid's, simplifies code, and ensures we won't reuse the opaque part of the stateid (as we currently do when the same openowner closes and reopens the same file). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 58 ++++++++++++----------------------------------------- fs/nfsd/state.h | 18 +++++------------ 2 files changed, 18 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fdd03f6ae044..922f47dd0d74 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,9 +49,7 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_ownerid = 1; -static u32 current_fileid = 1; -static u32 current_delegid = 1; +static u32 current_stateid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -136,11 +134,6 @@ unsigned int max_delegations; #define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) #define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -static unsigned int open_ownerid_hashval(const u32 id) -{ - return id & OPEN_OWNER_HASH_MASK; -} - static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; @@ -150,7 +143,6 @@ static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *owner return ret & OPEN_OWNER_HASH_MASK; } -static struct list_head open_ownerid_hashtbl[OPEN_OWNER_HASH_SIZE]; static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ @@ -255,9 +247,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp->dl_file = fp; dp->dl_type = type; dp->dl_stid.sc_type = NFS4_DELEG_STID; - dp->dl_stid.sc_stateid.si_boot = boot_time; - dp->dl_stid.sc_stateid.si_stateownerid = current_delegid++; - dp->dl_stid.sc_stateid.si_fileid = 0; + dp->dl_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; + dp->dl_stid.sc_stateid.si_opaque.so_id = current_stateid++; dp->dl_stid.sc_stateid.si_generation = 1; 
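For reference, the stateid layout this commit settles on fits in a few lines. The sketch below restates it as standalone user-space C; the field names mirror the diff, while init_stateid() and same_stateid() here are illustrative stand-ins for the kernel helpers.

/* New stateid layout: clientid plus a single global counter. */
#include <string.h>
#include <stdint.h>

typedef struct { uint32_t cl_boot; uint32_t cl_id; } clientid_t;
typedef struct { clientid_t so_clid; uint32_t so_id; } stateid_opaque_t;
typedef struct { uint32_t si_generation; stateid_opaque_t si_opaque; } stateid_t;

static uint32_t current_stateid = 1;

static void init_stateid(stateid_t *s, clientid_t clid)
{
        s->si_opaque.so_clid = clid;            /* embeds boot time too */
        s->si_opaque.so_id   = current_stateid++;
        s->si_generation     = 0;               /* bumped before first return */
}

/* Equality is a flat comparison of the opaque part. */
static int same_stateid(const stateid_t *a, const stateid_t *b)
{
        return memcmp(&a->si_opaque, &b->si_opaque,
                      sizeof(stateid_opaque_t)) == 0;
}

Since the counter never repeats within a boot, a close-and-reopen of the same file by the same openowner now yields a distinct opaque part, which the old (ownerid, fileid) scheme could not guarantee.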
hash_stid(&dp->dl_stid); fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); @@ -457,7 +448,6 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) { struct nfs4_ol_stateid *stp; - list_del(&lo->lo_owner.so_idhash); list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); while (!list_empty(&lo->lo_owner.so_stateids)) { @@ -502,7 +492,6 @@ static void unhash_openowner(struct nfs4_openowner *oo) { struct nfs4_ol_stateid *stp; - list_del(&oo->oo_owner.so_idhash); list_del(&oo->oo_owner.so_strhash); list_del(&oo->oo_perclient); while (!list_empty(&oo->oo_owner.so_stateids)) { @@ -1081,9 +1070,8 @@ static void gen_confirm(struct nfs4_client *clp) static int same_stateid(stateid_t *id_one, stateid_t *id_two) { - if (id_one->si_stateownerid != id_two->si_stateownerid) - return 0; - return id_one->si_fileid == id_two->si_fileid; + return 0 == memcmp(&id_one->si_opaque, &id_two->si_opaque, + sizeof(stateid_opaque_t)); } static struct nfs4_stid *find_stateid(stateid_t *t) @@ -2198,7 +2186,6 @@ alloc_init_file(struct inode *ino) INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); fp->fi_inode = igrab(ino); - fp->fi_id = current_fileid++; fp->fi_had_conflict = false; fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); @@ -2295,7 +2282,6 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj sop->so_owner.len = owner->len; INIT_LIST_HEAD(&sop->so_stateids); - sop->so_id = current_ownerid++; sop->so_client = clp; init_nfs4_replay(&sop->so_replay); return sop; @@ -2303,10 +2289,6 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - unsigned int idhashval; - - idhashval = open_ownerid_hashval(oo->oo_owner.so_id); - list_add(&oo->oo_owner.so_idhash, &open_ownerid_hashtbl[idhashval]); list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } @@ -2331,6 +2313,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str static inline void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_client *clp = oo->oo_owner.so_client; INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); @@ -2339,9 +2322,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_boot = boot_time; - stp->st_stid.sc_stateid.si_stateownerid = oo->oo_owner.so_id; - stp->st_stid.sc_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; + stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; hash_stid(&stp->st_stid); @@ -3095,8 +3077,6 @@ nfs4_laundromat(void) test_val = u; break; } - dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - oo->oo_owner.so_id); release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) @@ -3141,7 +3121,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) static int STALE_STATEID(stateid_t *stateid) { - if (stateid->si_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", 
STATEID_VAL(stateid)); @@ -3710,11 +3690,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -static unsigned int lockownerid_hashval(u32 id) -{ - return id & LOCK_HASH_MASK; -} - static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) @@ -3724,7 +3699,6 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, & LOCK_HASH_MASK; } -static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; /* @@ -3795,10 +3769,6 @@ find_lockowner_str(struct inode *inode, clientid_t *clid, static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { - unsigned int idhashval; - - idhashval = lockownerid_hashval(lo->lo_owner.so_id); - list_add(&lo->lo_owner.so_idhash, &lock_ownerid_hashtbl[idhashval]); list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -3831,6 +3801,7 @@ static struct nfs4_ol_stateid * alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = lo->lo_owner.so_client; stp = nfs4_alloc_stateid(); if (stp == NULL) @@ -3841,9 +3812,8 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp->st_stid.sc_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_boot = boot_time; - stp->st_stid.sc_stateid.si_stateownerid = lo->lo_owner.so_id; - stp->st_stid.sc_stateid.si_fileid = fp->fi_id; + stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; + stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; /* note will be incremented before first return to client: */ stp->st_stid.sc_stateid.si_generation = 0; hash_stid(&stp->st_stid); @@ -4252,7 +4222,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * data structures. 
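One consequence worth spelling out: with the server boot time riding inside the clientid, the STALE_STATEID() change above can classify a pre-reboot stateid from the stateid alone, with no table lookup. A tiny self-contained sketch of that check (an illustration, not the kernel code):

#include <stdint.h>

typedef struct { uint32_t cl_boot; uint32_t cl_id; } clientid_t;
typedef struct { clientid_t so_clid; uint32_t so_id; } stateid_opaque_t;
typedef struct { uint32_t si_generation; stateid_opaque_t si_opaque; } stateid_t;

static uint32_t boot_time;      /* recorded once at server startup */

/* Issued before the last reboot: stale, as opposed to merely bad. */
static int stateid_is_stale(const stateid_t *s)
{
        return s->si_opaque.so_clid.cl_boot != boot_time;
}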
*/ INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, @@ -4398,12 +4368,10 @@ nfs4_state_init(void) } for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&open_ownerid_hashtbl[i]); } for (i = 0; i < STATEID_HASH_SIZE; i++) INIT_LIST_HEAD(&stateid_hashtbl[i]); for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } memset(&onestateid, ~0, sizeof(stateid_t)); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e807abb116f6..d6aec4f8d3dd 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -45,24 +45,20 @@ typedef struct { } clientid_t; typedef struct { - u32 so_boot; - u32 so_stateownerid; - u32 so_fileid; + clientid_t so_clid; + u32 so_id; } stateid_opaque_t; typedef struct { u32 si_generation; stateid_opaque_t si_opaque; } stateid_t; -#define si_boot si_opaque.so_boot -#define si_stateownerid si_opaque.so_stateownerid -#define si_fileid si_opaque.so_fileid #define STATEID_FMT "(%08x/%08x/%08x/%08x)" #define STATEID_VAL(s) \ - (s)->si_boot, \ - (s)->si_stateownerid, \ - (s)->si_fileid, \ + (s)->si_opaque.so_clid.cl_boot, \ + (s)->si_opaque.so_clid.cl_id, \ + (s)->si_opaque.so_id, \ (s)->si_generation struct nfsd4_callback { @@ -353,11 +349,9 @@ struct nfs4_replay { */ struct nfs4_stateowner { - struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ struct list_head so_stateids; int so_is_open_owner; /* 1=openowner,0=lockowner */ - u32 so_id; struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ @@ -415,8 +409,6 @@ struct nfs4_file { struct file_lock *fi_lease; atomic_t fi_delegees; struct inode *fi_inode; - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ bool fi_had_conflict; }; -- cgit v1.2.3-58-ga151 From f7a4d872078a5e143d88adb561627f637046b05a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Sep 2011 20:12:38 -0400 Subject: nfsd4: hash closed stateid's like any other Look up closed stateid's in the stateid hash like any other stateid rather than searching the close lru. This is simpler, and fixes a bug: currently we handle only the case of a close that is the last close for a given stateowner, but not the case of a close for a stateowner that still has active opens on other files. Thus in a case like: open(owner, file1) open(owner, file2) close(owner, file2) close(owner, file2) the final close won't be recognized as a retransmission. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 101 +++++++++++++++++++++++++++------------------------- fs/nfsd/state.h | 4 ++- 2 files changed, 56 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 922f47dd0d74..e5cba833613f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -406,7 +406,6 @@ static int nfs4_access_to_omode(u32 access) static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - list_del(&stp->st_stid.sc_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } @@ -437,6 +436,7 @@ static void release_lock_stateid(struct nfs4_ol_stateid *stp) struct file *file; unhash_generic_stateid(stp); + list_del(&stp->st_stid.sc_hash); file = find_any_file(stp->st_file); if (file) locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); @@ -485,6 +485,7 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp) static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_open_stateid(stp); + list_del(&stp->st_stid.sc_hash); free_generic_stateid(stp); } @@ -501,12 +502,22 @@ static void unhash_openowner(struct nfs4_openowner *oo) } } +static void release_last_closed_stateid(struct nfs4_openowner *oo) +{ + struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; + + if (s) { + list_del_init(&s->st_stid.sc_hash); + free_generic_stateid(s); + oo->oo_last_closed_stid = NULL; + } +} + static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); list_del(&oo->oo_close_lru); - if (oo->oo_last_closed_stid) - free_generic_stateid(oo->oo_last_closed_stid); + release_last_closed_stateid(oo); nfs4_free_openowner(oo); } @@ -3099,23 +3110,11 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_openowner * search_close_lru(stateid_t *s) -{ - struct nfs4_openowner *local; - struct nfs4_ol_stateid *os; - - list_for_each_entry(local, &close_lru, oo_close_lru) { - os = local->oo_last_closed_stid; - if (same_stateid(&os->st_stid.sc_stateid, s)) - return local; - } - return NULL; -} - -static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + return nfserr_bad_stateid; + return nfs_ok; } static int @@ -3283,7 +3282,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - if (s->sc_type == NFS4_DELEG_STID) { + switch (s->sc_type) { + case NFS4_DELEG_STID: dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) @@ -3293,10 +3293,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } - } else { /* open or lock stateid */ + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: stp = openlockstateid(s); - status = nfserr_bad_stateid; - if (nfs4_check_fh(current_fh, stp)) + status = nfs4_check_fh(current_fh, stp); + if (status) goto out; if (stp->st_stateowner->so_is_open_owner && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) @@ -3311,6 +3313,9 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, else *filpp = find_writeable_file(stp->st_file); } + break; + default: + return nfserr_bad_stateid; } status = nfs_ok; out: @@ -3362,6 +3367,9 @@ 
nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfsd4_free_lock_stateid(openlockstateid(s)); else ret = nfserr_locks_held; + break; + default: + ret = nfserr_bad_stateid; } out: nfs4_unlock_state(); @@ -3390,12 +3398,19 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ struct nfs4_stateowner *sop = stp->st_stateowner; __be32 status; - if (nfs4_check_fh(current_fh, stp)) - return nfserr_bad_stateid; status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - return check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + /* + * "Closed" stateid's exist *only* to return + * nfserr_replay_me from the previous step. + */ + return nfserr_bad_stateid; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + return status; + return nfs4_check_fh(current_fh, stp); } /* @@ -3564,8 +3579,13 @@ void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) return; } oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; - free_generic_stateid(oo->oo_last_closed_stid); - oo->oo_last_closed_stid = NULL; + release_last_closed_stateid(oo); +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + unhash_open_stateid(s); + s->st_stid.sc_type = NFS4_CLOSED_STID; } /* @@ -3584,24 +3604,10 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_dentry->d_name.name); nfs4_lock_state(); - /* check close_lru for replay */ - status = nfs4_preprocess_confirmed_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, &stp); - if (stp == NULL && status == nfserr_expired) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - oo = search_close_lru(&close->cl_stateid); - /* It's not stale; let's assume it's expired: */ - if (oo == NULL) - goto out; - cstate->replay_owner = &oo->oo_owner; - status = nfsd4_check_seqid(cstate, &oo->oo_owner, close->cl_seqid); - if (status) - goto out; - status = nfserr_bad_seqid; - } + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3609,9 +3615,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* unhash_open_stateid() calls nfsd_close() if needed */ + nfsd4_close_open_stateid(stp); oo->oo_last_closed_stid = stp; - unhash_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d6aec4f8d3dd..da68bf66a3d7 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -76,7 +76,9 @@ struct nfs4_stid { #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 - char sc_type; +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 + unsigned char sc_type; struct list_head sc_hash; stateid_t sc_stateid; }; -- cgit v1.2.3-58-ga151 From 3d02fa29dec920c597dd7b7db608a4bc71f088ce Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Mon, 19 Sep 2011 15:07:41 -0400 Subject: nfsd4: fix open downgrade, again Yet another open-management regression: - nfs4_file_downgrade() doesn't remove the BOTH access bit on downgrade, so the server's idea of the stateid's access gets out of sync with the client's. If we want to keep an O_RDWR open in this case, we should do that in the file_put_access logic rather than here. - We forgot to convert v4 access to an open mode here. This logic has proven too hard to get right. In the future we may consider: - reexamining the lock/openowner relationship (locks probably don't really need to take their own references here). - adding open upgrade/downgrade support to the vfs. - removing the atomic operations. They're redundant as long as this is all under some other lock. Also, maybe some kind of additional static checking would help catch O_/NFS4_SHARE_ACCESS confusion. Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e5cba833613f..edcced18caa8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -194,8 +194,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { - nfs4_file_put_fd(fp, O_RDWR); nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); } } @@ -3500,8 +3507,9 @@ static inline void nfs4_file_downgrade(struct nfs4_ol_stateid *stp, unsigned int int i; for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { - nfs4_file_put_access(stp->st_file, i); + if (test_bit(i, &stp->st_access_bmap) + && ((i & to_access) != i)) { + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); __clear_bit(i, &stp->st_access_bmap); } } -- cgit v1.2.3-58-ga151 From c856694e3d46976c76bf5b92091cb1efa211208d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Sep 2011 08:49:51 -0400 Subject: nfsd4: make op_cacheresult another flag I'm not sure why I used a new field for this originally. Also, the differences between some of these flags are a little subtle; add some comments to explain. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index aa769dfa756a..4e41f65c7021 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1013,14 +1013,15 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, - OP_MODIFIES_SOMETHING = 1 << 5, /* op is non-idempotent */ -}; - -struct nfsd4_operation { - nfsd4op_func op_func; - u32 op_flags; - char *op_name; /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. 
This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: * We use the DRC for compounds containing non-idempotent * operations, *except* those that are 4.1-specific (since * sessions provide their own EOS), and except for stateful @@ -1028,7 +1029,13 @@ struct nfsd4_operation { * (since sequence numbers provide EOS for open, lock, etc in * the v4.0 case). */ - bool op_cacheresult; + OP_CACHEME = 1 << 6, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; /* Try to get response size before operation */ nfsd4op_rsize op_rsize_bop; }; @@ -1077,7 +1084,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { - return OPDESC(op)->op_cacheresult; + return OPDESC(op)->op_flags & OP_CACHEME; } static bool need_wrongsec_check(struct svc_rqst *rqstp) @@ -1420,9 +1427,8 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_CREATE", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { @@ -1442,9 +1448,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, - .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, .op_name = "OP_LINK", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { @@ -1534,16 +1540,14 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_REMOVE", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { .op_func = (nfsd4op_func)nfsd4_rename, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_RENAME", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { @@ -1575,24 +1579,21 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", - .op_flags = OP_MODIFIES_SOMETHING, - .op_cacheresult = true, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_MODIFIES_SOMETHING, + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_MODIFIES_SOMETHING, + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID_CONFIRM", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { @@ -1601,9 +1602,8 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_WRITE", - .op_cacheresult = true, .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, 
}, [OP_RELEASE_LOCKOWNER] = { -- cgit v1.2.3-58-ga151 From 8335ebd94b3f5bed7875cc35848bbe46d8381695 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Sep 2011 08:34:32 -0400 Subject: leases: split up generic_setlease into lock/unlock cases Eventually we should probably do the same thing to the file operations as well. Signed-off-by: J. Bruce Fields --- fs/locks.c | 98 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 9b8408eb6946..b342902c38bc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1352,18 +1352,7 @@ int fcntl_getlease(struct file *filp) return type; } -/** - * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted - * - * The (input) flp->fl_lmops->lm_break function is required - * by break_lease(). - * - * Called with file_lock_lock held. - */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; @@ -1372,30 +1361,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) lease = *flp; - error = -EACCES; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - goto out; - error = -EINVAL; - if (!S_ISREG(inode->i_mode)) + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; - error = security_file_lock(filp, arg); - if (error) + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) goto out; - time_out_leases(inode); - - BUG_ON(!(*flp)->fl_lmops->lm_break); - - if (arg != F_UNLCK) { - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((dentry->d_count > 1) - || (atomic_read(&inode->i_count) > 1))) - goto out; - } - /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1433,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; } - if (arg == F_UNLCK) - goto out; - error = -EINVAL; if (!leases_enable) goto out; @@ -1446,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) out: return error; } + +int generic_delete_lease(struct file *filp, struct file_lock **flp) +{ + struct file_lock *fl, **before; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file != filp) + continue; + return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + } + return -EAGAIN; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->lm_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. 
+ */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error; + + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->lm_break); + + switch (arg) { + case F_UNLCK: + return generic_delete_lease(filp, flp); + case F_RDLCK: + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: + BUG(); + } +} EXPORT_SYMBOL(generic_setlease); static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) -- cgit v1.2.3-58-ga151 From 2a74aba799bfbc02977b69400b7bf4d2850aea79 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Sep 2011 17:20:02 -0400 Subject: nfsd4: move client * to nfs4_stateid, add init_stid helper This will be convenient. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 8 ++++---- fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++---------------------- fs/nfsd/state.h | 2 +- 3 files changed, 31 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 93b5e405ad38..de018ecadae6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -787,7 +787,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -809,7 +809,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -832,7 +832,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); @@ -1006,7 +1006,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dp->dl_retries = 1; cb->cb_op = dp; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index edcced18caa8..cb36c9a2e8ca 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -224,6 +224,19 @@ static inline void hash_stid(struct nfs4_stid *stid) list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); } +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +{ + stateid_t *s = &stid->sc_stateid; + + stid->sc_type = type; + stid->sc_client = cl; + s->si_opaque.so_clid = cl->cl_clientid; + s->si_opaque.so_id = current_stateid++; + /* Will be incremented before return to client: */ + s->si_generation = 0; + hash_stid(stid); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, 
struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -245,19 +258,20 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't really meaningful, really, but let's + * avoid 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); - dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stid.sc_type = NFS4_DELEG_STID; - dp->dl_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; - dp->dl_stid.sc_stateid.si_opaque.so_id = current_stateid++; - dp->dl_stid.sc_stateid.si_generation = 1; - hash_stid(&dp->dl_stid); fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -2333,18 +2347,13 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd struct nfs4_openowner *oo = open->op_openowner; struct nfs4_client *clp = oo->oo_owner.so_client; + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_stid.sc_type = NFS4_OPEN_STID; stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; - stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; - /* note will be incremented before first return to client: */ - stp->st_stid.sc_stateid.si_generation = 0; - hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = 0; __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, @@ -2792,7 +2801,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) { list_del_init(&dp->dl_perclnt); @@ -2821,7 +2830,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) atomic_inc(&fp->fi_delegees); list_add(&dp->dl_perfile, &fp->fi_delegations); spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); return 0; } @@ -3295,7 +3304,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_client); + renew_client(dp->dl_stid.sc_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); @@ -3665,7 +3674,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_client); + renew_client(dp->dl_stid.sc_client); unhash_delegation(dp); out: @@ -3819,17 +3828,12 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp = nfs4_alloc_stateid(); if (stp == NULL) goto out; + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, 
&lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; - stp->st_stid.sc_type = NFS4_LOCK_STID; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stid.sc_stateid.si_opaque.so_clid = clp->cl_clientid; - stp->st_stid.sc_stateid.si_opaque.so_id = current_stateid++; - /* note will be incremented before first return to client: */ - stp->st_stid.sc_stateid.si_generation = 0; - hash_stid(&stp->st_stid); stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index da68bf66a3d7..70062b75e24a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -81,6 +81,7 @@ struct nfs4_stid { unsigned char sc_type; struct list_head sc_hash; stateid_t sc_stateid; + struct nfs4_client *sc_client; }; struct nfs4_delegation { @@ -88,7 +89,6 @@ struct nfs4_delegation { struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ atomic_t dl_count; /* ref count */ - struct nfs4_client *dl_client; struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; -- cgit v1.2.3-58-ga151 From 6136d2b409652b064b2da6d43d5c47cbd1d2cc14 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Sep 2011 16:21:15 -0400 Subject: nfsd4: use idr for stateid's The idr system is designed exactly for generating id and looking up integer id's. Thanks to Trond for pointing it out. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 124 +++++++++++++++++++++++++++++++--------------------- fs/nfsd/state.h | 1 - 2 files changed, 73 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cb36c9a2e8ca..a9e71cdf4a84 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -32,6 +32,7 @@ * */ +#include #include #include #include @@ -49,7 +50,6 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_stateid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -149,10 +149,7 @@ static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_ol_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) +struct idr stateids; static unsigned int file_hashval(struct inode *ino) { @@ -160,13 +157,7 @@ static unsigned int file_hashval(struct inode *ino) return hash_ptr(ino, FILE_HASH_BITS); } -static unsigned int stateid_hashval(stateid_t *s) -{ - return opaque_hashval(&s->si_opaque, sizeof(stateid_opaque_t)) & STATEID_HASH_MASK; -} - static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -215,26 +206,52 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } -static inline void hash_stid(struct nfs4_stid *stid) +static inline int get_new_stid(struct nfs4_stid *stid) { - stateid_t *s = &stid->sc_stateid; - unsigned int hashval; + static int min_stateid = 0; + int new_stid; + int error; + + if (!idr_pre_get(&stateids, GFP_KERNEL)) + return -ENOMEM; + + error = idr_get_new_above(&stateids, stid, min_stateid, &new_stid); + /* + * All this code is currently serialized; the preallocation + * above should still be ours: + */ + BUG_ON(error); + /* + * It shouldn't be 
a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ - hashval = stateid_hashval(s); - list_add(&stid->sc_hash, &stateid_hashtbl[hashval]); + min_stateid = new_stid+1; + if (min_stateid == INT_MAX) + min_stateid = 0; + return new_stid; } -static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +static inline __be32 init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) { stateid_t *s = &stid->sc_stateid; + int new_id; stid->sc_type = type; stid->sc_client = cl; s->si_opaque.so_clid = cl->cl_clientid; - s->si_opaque.so_id = current_stateid++; + new_id = get_new_stid(stid); + if (new_id < 0) + return nfserr_jukebox; + s->si_opaque.so_id = (u32)new_id; /* Will be incremented before return to client: */ s->si_generation = 0; - hash_stid(stid); + return 0; } static struct nfs4_delegation * @@ -242,6 +259,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; + __be32 status; dprintk("NFSD alloc_init_deleg\n"); /* @@ -258,11 +276,15 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; - init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + status = init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + if (status) { + kmem_cache_free(deleg_slab, dp); + return NULL; + } /* * delegation seqid's are never incremented. The 4.1 special - * meaning of seqid 0 isn't really meaningful, really, but let's - * avoid 0 anyway just for consistency and use 1: + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: */ dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; @@ -300,11 +322,16 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) } } +static void unhash_stid(struct nfs4_stid *s) +{ + idr_remove(&stateids, s->sc_stateid.si_opaque.so_id); +} + /* Called under the state lock. 
*/ static void unhash_delegation(struct nfs4_delegation *dp) { - list_del_init(&dp->dl_stid.sc_hash); + unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -457,7 +484,7 @@ static void release_lock_stateid(struct nfs4_ol_stateid *stp) struct file *file; unhash_generic_stateid(stp); - list_del(&stp->st_stid.sc_hash); + unhash_stid(&stp->st_stid); file = find_any_file(stp->st_file); if (file) locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); @@ -506,7 +533,7 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp) static void release_open_stateid(struct nfs4_ol_stateid *stp) { unhash_open_stateid(stp); - list_del(&stp->st_stid.sc_hash); + unhash_stid(&stp->st_stid); free_generic_stateid(stp); } @@ -528,7 +555,7 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo) struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; if (s) { - list_del_init(&s->st_stid.sc_hash); + unhash_stid(&s->st_stid); free_generic_stateid(s); oo->oo_last_closed_stid = NULL; } @@ -1099,23 +1126,9 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } -static int -same_stateid(stateid_t *id_one, stateid_t *id_two) -{ - return 0 == memcmp(&id_one->si_opaque, &id_two->si_opaque, - sizeof(stateid_opaque_t)); -} - static struct nfs4_stid *find_stateid(stateid_t *t) { - struct nfs4_stid *s; - unsigned int hashval; - - hashval = stateid_hashval(t); - list_for_each_entry(s, &stateid_hashtbl[hashval], sc_hash) - if (same_stateid(&s->sc_stateid, t)) - return s; - return NULL; + return idr_find(&stateids, t->si_opaque.so_id); } static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) @@ -2342,12 +2355,14 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return oo; } -static inline void -init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_client *clp = oo->oo_owner.so_client; + __be32 status; - init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); + status = init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); + if (status) + return status; INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -2360,6 +2375,7 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; + return nfs_ok; } static void @@ -2949,7 +2965,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; - init_open_stateid(stp, fp, open); + status = init_open_stateid(stp, fp, open); + if (status) { + release_open_stateid(stp); + goto out; + } status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); @@ -3824,11 +3844,16 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = lo->lo_owner.so_client; + __be32 status; stp = nfs4_alloc_stateid(); if (stp == NULL) - goto out; - init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); + return NULL; + status = init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); + if (status) { + free_generic_stateid(stp); + return NULL; + } 
list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; @@ -3837,8 +3862,6 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - -out: return stp; } @@ -4386,8 +4409,7 @@ nfs4_state_init(void) for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } - for (i = 0; i < STATEID_HASH_SIZE; i++) - INIT_LIST_HEAD(&stateid_hashtbl[i]); + idr_init(&stateids); for (i = 0; i < LOCK_HASH_SIZE; i++) { INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 70062b75e24a..3ed5f99141ec 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -79,7 +79,6 @@ struct nfs4_stid { /* For an open stateid kept around *only* to process close replays: */ #define NFS4_CLOSED_STID 8 unsigned char sc_type; - struct list_head sc_hash; stateid_t sc_stateid; struct nfs4_client *sc_client; }; -- cgit v1.2.3-58-ga151 From 36279ac10c3d69372af875f1affafd375db687a9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 26 Sep 2011 12:53:00 -0400 Subject: nfsd4: assume test_stateid always has session Test_stateid is 4.1-only and only allowed after a sequence operation, so this check is unnecessary. Cc: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 +++--- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 2 +- fs/nfsd/xdr4.h | 1 - 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a9e71cdf4a84..daf75fa4c027 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3256,7 +3256,7 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess return nfserr_old_stateid; } -__be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) +__be32 nfs4_validate_stateid(stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; @@ -3268,7 +3268,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid, bool has_session) s = find_stateid(stateid); if (!s) return nfserr_stale_stateid; - status = check_stateid_generation(stateid, &s->sc_stateid, has_session); + status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) return status; if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) @@ -3374,7 +3374,7 @@ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid *test_stateid) { - test_stateid->ts_has_session = nfsd4_has_session(cstate); + /* real work is done during encoding */ return nfs_ok; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7bd57c2dbc4d..2429fffa31dd 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3302,7 +3302,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, nfs4_lock_state(); for (i = 0; i < test_stateid->ts_num_ids; i++) { nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); + valid = nfs4_validate_stateid(&si); RESERVE_SPACE(4); *p++ = htonl(valid); resp->p = p; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3ed5f99141ec..55a4d6a108a2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -508,7 +508,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); 
-extern __be32 nfs4_validate_stateid(stateid_t *, bool); +extern __be32 nfs4_validate_stateid(stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index a767b57b8208..c9012149637c 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -343,7 +343,6 @@ struct nfsd4_saved_compoundargs { struct nfsd4_test_stateid { __be32 ts_num_ids; - bool ts_has_session; struct nfsd4_compoundargs *ts_saved_args; struct nfsd4_saved_compoundargs ts_savedp; }; -- cgit v1.2.3-58-ga151 From 38c2f4b12a455cb3a108fd5c79a10df2ba3ec9a7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Sep 2011 17:01:19 -0400 Subject: nfsd4: look up stateid's per clientid Use a separate stateid idr per client, and lookup a stateid by first finding the client, then looking up the stateid relative to that client. Also some minor refactoring. This allows us to improve error returns: we can return expired when the clientid is not found and bad_stateid when the clientid is found but not the stateid, as opposed to returning expired for both cases. I hope this will also help to replace the state lock mostly by a per-client lock, but that hasn't been done yet. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 112 +++++++++++++++++++++++----------------------------- fs/nfsd/nfs4xdr.c | 3 +- fs/nfsd/state.h | 4 +- 3 files changed, 54 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index daf75fa4c027..931155f51ecc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -32,7 +32,6 @@ * */ -#include #include #include #include @@ -149,8 +148,6 @@ static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -struct idr stateids; - static unsigned int file_hashval(struct inode *ino) { /* XXX: why are we hashing on inode pointer, anyway? */ @@ -209,13 +206,14 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) static inline int get_new_stid(struct nfs4_stid *stid) { static int min_stateid = 0; + struct idr *stateids = &stid->sc_client->cl_stateids; int new_stid; int error; - if (!idr_pre_get(&stateids, GFP_KERNEL)) + if (!idr_pre_get(stateids, GFP_KERNEL)) return -ENOMEM; - error = idr_get_new_above(&stateids, stid, min_stateid, &new_stid); + error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); /* * All this code is currently serialized; the preallocation * above should still be ours: @@ -324,7 +322,9 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) static void unhash_stid(struct nfs4_stid *s) { - idr_remove(&stateids, s->sc_stateid.si_opaque.so_id); + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); } /* Called under the state lock. 
*/ @@ -1126,16 +1126,16 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } -static struct nfs4_stid *find_stateid(stateid_t *t) +static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) { - return idr_find(&stateids, t->si_opaque.so_id); + return idr_find(&cl->cl_stateids, t->si_opaque.so_id); } -static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) +static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) { struct nfs4_stid *s; - s = find_stateid(t); + s = find_stateid(cl, t); if (!s) return NULL; if (typemask & s->sc_type) @@ -1143,16 +1143,6 @@ static struct nfs4_stid *find_stateid_by_type(stateid_t *t, char typemask) return NULL; } -static struct nfs4_ol_stateid *find_ol_stateid_by_type(stateid_t *t, char typemask) -{ - struct nfs4_stid *s; - - s = find_stateid_by_type(t, typemask); - if (!s) - return NULL; - return openlockstateid(s); -} - static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -1175,6 +1165,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } } + idr_init(&clp->cl_stateids); memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; @@ -2611,24 +2602,24 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); } static __be32 -nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; - *dp = find_deleg_stateid(&open->op_delegate_stateid); + *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -2920,6 +2911,7 @@ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_ol_stateid *stp = NULL; @@ -2939,7 +2931,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(fp, open, &dp); + status = nfs4_check_deleg(cl, fp, open, &dp); if (status) goto out; } else { @@ -3256,7 +3248,7 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_sess return nfserr_old_stateid; } -__be32 nfs4_validate_stateid(stateid_t *stateid) +__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; @@ -3265,7 +3257,7 @@ __be32 nfs4_validate_stateid(stateid_t *stateid) if (STALE_STATEID(stateid)) return nfserr_stale_stateid; - s = find_stateid(stateid); + s = find_stateid(cl, stateid); if (!s) return nfserr_stale_stateid; status = check_stateid_generation(stateid, &s->sc_stateid, 1); @@ -3280,6 +3272,24 @@ __be32 
nfs4_validate_stateid(stateid_t *stateid) return nfs_ok; } +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) +{ + struct nfs4_client *cl; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + cl = find_confirmed_client(&stateid->si_opaque.so_clid); + if (!cl) + return nfserr_expired; + *s = find_stateid_by_type(cl, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + return nfs_ok; + +} + /* * Checks for stateid operations */ @@ -3303,18 +3313,9 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - - /* - * We assume that any stateid that has the current boot time, - * but that we can't find, is expired: - */ - status = nfserr_expired; - s = find_stateid(stateid); - if (!s) - goto out; + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); + if (status) + return status; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3384,10 +3385,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; + struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; nfs4_lock_state(); - s = find_stateid(stateid); + s = find_stateid(cl, stateid); if (!s) goto out; switch (s->sc_type) { @@ -3419,15 +3421,6 @@ setlkflg (int type) RD_STATE : WR_STATE; } -static __be32 nfs4_nospecial_stateid_checks(stateid_t *stateid) -{ - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return nfserr_bad_stateid; - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - return nfs_ok; -} - static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) { struct svc_fh *current_fh = &cstate->current_fh; @@ -3458,17 +3451,16 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, struct nfs4_ol_stateid **stpp) { __be32 status; + struct nfs4_stid *s; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - status = nfs4_nospecial_stateid_checks(stateid); + status = nfsd4_lookup_stateid(stateid, typemask, &s); if (status) return status; - *stpp = find_ol_stateid_by_type(stateid, typemask); - if (*stpp == NULL) - return nfserr_expired; + *stpp = openlockstateid(s); cstate->replay_owner = (*stpp)->st_stateowner; renew_client((*stpp)->st_stateowner->so_client); @@ -3673,6 +3665,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; struct inode *inode; __be32 status; @@ -3681,16 +3674,10 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, inode = cstate->current_fh.fh_dentry->d_inode; nfs4_lock_state(); - status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - goto out; - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - status = nfserr_expired; - dp = find_deleg_stateid(stateid); - if (!dp) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); + if (status) goto out; + dp = delegstateid(s); status = 
check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -4409,7 +4396,6 @@ nfs4_state_init(void) for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } - idr_init(&stateids); for (i = 0; i < LOCK_HASH_SIZE; i++) { INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2429fffa31dd..5779acde7e70 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3287,6 +3287,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_test_stateid *test_stateid) { struct nfsd4_compoundargs *argp; + struct nfs4_client *cl = resp->cstate.session->se_client; stateid_t si; __be32 *p; int i; @@ -3302,7 +3303,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, nfs4_lock_state(); for (i = 0; i < test_stateid->ts_num_ids; i++) { nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(&si); + valid = nfs4_validate_stateid(cl, &si); RESERVE_SPACE(4); *p++ = htonl(valid); resp->p = p; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 55a4d6a108a2..13f6f9f5ceec 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include #include #include #include "nfsfh.h" @@ -231,6 +232,7 @@ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct list_head cl_strhash; /* hash by cl_name */ struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ @@ -508,7 +510,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(stateid_t *); +extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ -- cgit v1.2.3-58-ga151 From c4253cb0748cd50060d04d838c38b07f1ad0e6e5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 22 Sep 2011 19:34:33 +0200 Subject: sysfs: add unsigned long cast to prevent compile warning "sysfs: use rb-tree for inode number lookup" added a new printk which causes a new compile warning on s390 (and few other architectures): fs/sysfs/dir.c: In function 'sysfs_link_sibling': fs/sysfs/dir.c:63:4: warning: format '%lx' expects argument of type 'long unsigned int', but argument 2 has type 'ino_t' [-Wform Add an explicit unsigned long cast since ino_t is an unsigned long on most architectures. 
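To see why the cast, rather than a format-string change, is the portable fix: ino_t is typedef'ed with different widths across architectures, so only an explicit promotion matches the "%lx" conversion everywhere. A reduced, hypothetical example (show_ino() is made up for illustration and is not part of this patch):

	static void show_ino(ino_t ino)
	{
		/* trips -Wformat on architectures where ino_t is not unsigned long */
		printk(KERN_CRIT "sysfs: inode '%lx'\n", ino);

		/* explicit promotion: warning-free on all architectures */
		printk(KERN_CRIT "sysfs: inode '%lx'\n", (unsigned long) ino);
	}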
Cc: Mikulas Patocka Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c3646d93a032..83bb9d1f30aa 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -60,7 +60,8 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) } else if (sd->s_ino > node->s_ino) { p = &node->inode_node.rb_right; } else { - printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", sd->s_ino); + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", + (unsigned long) sd->s_ino); BUG(); } #undef node -- cgit v1.2.3-58-ga151 From a542ad1bafc7df9fc16de8a6894b350a4df75572 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:52:59 +0200 Subject: btrfs: added helper functions to iterate backrefs These helper functions iterate back references and call a function for each backref. There is also a function to resolve an inode to a path in the file system. Signed-off-by: Jan Schmidt --- fs/btrfs/Makefile | 3 +- fs/btrfs/backref.c | 776 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/backref.h | 62 +++++ fs/btrfs/ioctl.h | 11 + 4 files changed, 851 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/backref.c create mode 100644 fs/btrfs/backref.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac08c21f..89b6ce3634fd 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \ + scrub.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 000000000000..2351df0de450 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,776 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" + +struct __data_ref { + struct list_head list; + u64 inum; + u64 root; + u64 extent_data_item_offset; +}; + +struct __shared_ref { + struct list_head list; + u64 disk_byte; +}; + +static int __inode_info(u64 inum, u64 ioff, u8 key_type, + struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + key.type = key_type; + key.objectid = inum; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || found_key->objectid != key.objectid) + return 1; + + return 0; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct btrfs_key key; + return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, + &key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_key *found_key) +{ + return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, + found_key); +} + +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! 
+ */ +static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + u32 len; + int slot; + u64 next_inum; + int ret; + s64 bytes_left = size - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + while (1) { + len = btrfs_inode_ref_name_len(eb, iref); + bytes_left -= len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + (unsigned long)(iref + 1), len); + if (eb != eb_in) + free_extent_buffer(eb); + ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret) + break; + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) + atomic_inc(&eb->refs); + btrfs_release_path(path); + + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key) +{ + int ret; + u64 flags; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + ret = btrfs_previous_item(fs_info->extent_root, path, + 0, BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + found_key->objectid > logical || + found_key->objectid + found_key->offset <= logical) + return -ENOENT; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return BTRFS_EXTENT_FLAG_TREE_BLOCK; + if (flags & BTRFS_EXTENT_FLAG_DATA) + return BTRFS_EXTENT_FLAG_DATA; + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. 
+ * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((void *)*ptr >= (void *)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_tree_block_info *info; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + *out_level = btrfs_tree_block_level(eb, info); + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long 
ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. + */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb; + int slot; + int nritems; + int ret; + int found = 0; + + eb = read_tree_block(fs_info->tree_root, logical, + fs_info->tree_root->leafsize, 0); + if (!eb) + return -EIO; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; + } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; + } + + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + + free_extent_buffer(eb); + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. will use the path given as a parameter and return it + * released. + * when the iterator function returns a non-zero value, iteration stops. 
+ */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx) +{ + unsigned long ptr = 0; + int last; + int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + struct list_head data_refs = LIST_HEAD_INIT(data_refs); + struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) + break; + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); + } + } + + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. 
+ */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 offset; + struct btrfs_key found_key; + + ret = extent_from_logical(fs_info, logical, path, + &found_key); + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -EINVAL; + if (ret < 0) + return ret; + + offset = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, path, found_key.objectid, + offset, iterate, ctx); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (1) { + ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, + &found_key); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + btrfs_release_path(path); + + item = btrfs_item_nr(eb, slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + ret = iterate(parent, iref, eb, ctx); + if (ret) { + free_extent_buffer(eb); + break; + } + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr; + fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, + inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->str[i] = fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->str[i]. 
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = kmalloc(alloc_bytes, GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 000000000000..92618837cb8f --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include "ioctl.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); +typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +#endif diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea789fcb4..1857e3871843 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,17 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +struct btrfs_data_container { + __u32 bytes_left; /* out -- bytes not needed to deliver output */ + __u32 bytes_missing; /* out -- additional bytes needed for result */ + __u32 elem_cnt; /* out */ + __u32 elem_missed; /* out */ + union { + char *str[0]; /* out */ + __u64 val[0]; /* out */ + }; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ struct btrfs_ioctl_vol_args) -- cgit v1.2.3-58-ga151 From 13db62b7a1e8c64763a93c155091620f85ff8920 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:56:13 +0200 Subject: btrfs scrub: added unverified_errors In normal operation, scrub is reading data sequentially in large portions. In case of an i/o error, we try to find the corrupted area(s) by issuing page-sized read requests. With this commit we increment the unverified_errors counter if all of the small size requests succeed. Userland patches carrying such conspicuous events to the administrator should already be around.
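For illustration, here is a stand-alone sketch of the accounting rule this commit introduces. The helper names (recheck_page, handle_failed_bio) are hypothetical, not the kernel's functions: a failed large bio is re-read page by page, only pages that still fail count as read errors, and if every page-sized re-read succeeds the event is counted once as unverified.

    #include <stdio.h>

    struct stats { unsigned read_errors; unsigned unverified_errors; };

    /* stands in for scrub_recheck_error(): re-read one page and return 1
     * (counting a read error) only if the page is still bad */
    static int recheck_page(int still_bad, struct stats *st)
    {
            if (!still_bad)
                    return 0;       /* page-sized re-read succeeded */
            st->read_errors++;
            return 1;
    }

    static void handle_failed_bio(const int *still_bad, int count,
                                  struct stats *st)
    {
            int i, any_bad = 0;

            for (i = 0; i < count; i++)
                    any_bad |= recheck_page(still_bad[i], st);
            if (!any_bad)   /* bio failed, but all small reads were fine */
                    st->unverified_errors++;
    }

    int main(void)
    {
            struct stats st = { 0, 0 };
            int pages[4] = { 0, 0, 0, 0 }; /* transient error: no page is bad */

            handle_failed_bio(pages, 4, &st);
            printf("read_errors=%u unverified_errors=%u\n",
                   st.read_errors, st.unverified_errors);
            return 0;
    }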
Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..35099fa97d56 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -201,18 +201,25 @@ nomem: * recheck_error gets called for every page in the bio, even though only * one may be bad */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) - return; + return 0; } } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; } static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -382,8 +389,14 @@ static void scrub_checksum(struct btrfs_work *work) int ret; if (sbio->err) { + ret = 0; for (i = 0; i < sbio->count; ++i) - scrub_recheck_error(sbio, i); + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +409,6 @@ static void scrub_checksum(struct btrfs_work *work) bi->bv_offset = 0; bi->bv_len = PAGE_SIZE; } - - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); goto out; } for (i = 0; i < sbio->count; ++i) { @@ -420,8 +429,14 @@ static void scrub_checksum(struct btrfs_work *work) WARN_ON(1); } kunmap_atomic(buffer, KM_USER0); - if (ret) - scrub_recheck_error(sbio, i); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } } out: -- cgit v1.2.3-58-ga151 From 558540c17771eaf89b1a3be39aa2c8bc837da1a6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:59:12 +0200 Subject: btrfs scrub: print paths of corrupted files While scrubbing, we may encounter various errors. Previously, a logical address was printed to the log only. Now, all paths belonging to that address are resolved and printed separately. That should work for hardlinks as well as reflinks. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 163 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 35099fa97d56..221fd5c48736 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,12 @@ */ #include +#include #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "backref.h" /* * This is only the first step towards a full-features scrub. 
It reads all @@ -100,6 +102,19 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -195,6 +210,143 @@ nomem: return ERR_PTR(-ENOMEM); } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + ipath->fspath->str[i]); + + free_ipath(ipath); + return 0; + +err: + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) +{ + struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u32 item_size; + int ret; + u64 ref_root; + u8 ref_level; + unsigned long ptr = 0; + const int bufsize = 4096; + u64 extent_offset; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; + swarn.errstr = errstr; + swarn.dev = dev; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + if (ret < 0) + goto out; + + extent_offset = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = 
path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, ei, item_size, + &ref_root, &ref_level); + printk(KERN_WARNING "%s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, dev->name, + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + } else { + swarn.path = path; + iterate_extent_inodes(fs_info, path, found_key.objectid, + extent_offset, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. In the latter case, @@ -205,6 +357,8 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { struct scrub_dev *sdev = sbio->sdev; u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (sbio->err) { if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, @@ -212,6 +366,11 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) return 0; } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } spin_lock(&sdev->stat_lock); @@ -326,9 +485,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: fixed up at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); return; uncorrectable: @@ -337,9 +495,8 @@ uncorrectable: ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: unable to fixup at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, -- cgit v1.2.3-58-ga151 From 193ea74b2729e6ddc08fb6bde6e15a3bd4d94071 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:56:54 +0200 Subject: btrfs scrub: bugfix: mirror_num off by one Fix the mirror_num determination in scrub_stripe. The rest of the scrub code did not use mirror_num for anything important and that error went unnoticed. The nodatasum fixup patch of this set depends on a correct mirror_num. 
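To make the numbering convention concrete, a minimal stand-alone sketch (simplified, not kernel code): mirror_num is 1-based, with 0 meaning "no preference", so a loop over 0-based stripe indices must compare i + 1 when skipping the copy that failed.

    #include <stdio.h>

    int main(void)
    {
            int num_stripes = 2;    /* e.g. RAID1: two copies */
            int mirror_num = 1;     /* 1-based number of the failed copy */
            int i;

            for (i = 0; i < num_stripes; i++) {
                    /* before the fix this compared "i == mirror_num" and
                     * skipped the wrong stripe */
                    if (i + 1 == mirror_num)
                            continue;
                    printf("trying stripe %d (mirror %d) for a good copy\n",
                           i, i + 1);
            }
            return 0;
    }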
Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 221fd5c48736..59caf8fcd1c7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -452,7 +452,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) * first find a good copy */ for (i = 0; i < multi->num_stripes; ++i) { - if (i == sbio->spag[ix].mirror_num) + if (i + 1 == sbio->spag[ix].mirror_num) continue; if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, @@ -930,21 +930,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 0; + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes; + mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else { increment = map->stripe_len; - mirror_num = 0; + mirror_num = 1; } path = btrfs_alloc_path(); -- cgit v1.2.3-58-ga151 From 8ddc7d9cd0a00062247c732b96386ec2462bdbc7 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 20:02:58 +0200 Subject: btrfs: add mirror_num to extent_read_full_page Currently, extent_read_full_page always assumes we are trying to read mirror 0, which generally is the best we can do. To add flexibility, pass it as a parameter. This will be needed by scrub fixup code. 
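The resulting call-site shapes, excerpted from the hunks below and from the nodatasum fixup patch later in this series (a usage sketch, not a compilable unit):

    /* default read path: mirror_num 0 lets the mapping code pick any copy */
    ret = extent_read_full_page(tree, page, btrfs_get_extent, 0);

    /* repair path: pin the read to one specific copy */
    ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
                                btrfs_get_extent, fixup->mirror_num);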
Signed-off-by: Jan Schmidt --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..dc0343802535 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -908,7 +908,7 @@ static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..afebb95e3490 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2076,16 +2076,16 @@ out: } int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); if (bio) - ret = submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..a9dd994bf826 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -185,7 +185,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..efee8c7cfa45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6289,7 +6289,7 @@ int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent); + return extent_read_full_page(tree, page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -- cgit v1.2.3-58-ga151 From e12fa9cd390f8e93a9144bd99bd6f6ed316fbc1e Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 17 Jun 2011 15:55:21 +0200 Subject: btrfs scrub: use int for mirror_num, not u64 the rest of the code uses int mirror_num, and so should scrub Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 59caf8fcd1c7..41a01147b959 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -65,7 +65,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); struct scrub_page { u64 flags; /* extent flags */ u64 generation; - u64 mirror_num; + int mirror_num; int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -776,7 +776,7 @@ nomem: } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num, + u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_bio *sbio; @@ -873,7 +873,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for 
each bio */ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num) + u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -919,7 +919,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, u64 physical; u64 logical; u64 generation; - u64 mirror_num; + int mirror_num; u64 increment = map->stripe_len; u64 offset; -- cgit v1.2.3-58-ga151 From 0ef8e45158f97dde4801b535e25f70f7caf01a27 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 20:04:15 +0200 Subject: btrfs scrub: add fixup code for errors on nodatasum files This removes a FIXME comment and introduces the first part of nodatasum fixup: It gets the corresponding inode for a logical address and triggers a regular readpage for the corrupted sector. Once we have on-the-fly error correction our error will be automatically corrected. The correction code is expected to clear the newly introduced EXTENT_DAMAGED flag, making scrub report that error as "corrected" instead of "uncorrectable" eventually. Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.h | 1 + fs/btrfs/scrub.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a9dd994bf826..435d454b9926 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,7 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_DAMAGED (1 << 13) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 41a01147b959..db09f01c0e4f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -22,6 +22,7 @@ #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "transaction.h" #include "backref.h" /* @@ -89,6 +90,7 @@ struct scrub_dev { int first_free; int curr; atomic_t in_flight; + atomic_t fixup_cnt; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -102,6 +104,14 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_fixup_nodatasum { + struct scrub_dev *sdev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -190,12 +200,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; - else + else sdev->bios[i]->next_free = -1; } sdev->first_free = 0; sdev->curr = -1; atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); @@ -347,6 +358,151 @@ out: kfree(swarn.msg_buf); } +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct page *page; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = ctx; + int ret; + int corrected; + struct btrfs_key key; + struct inode *inode; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + 
inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); + + /* set_extent_bit should either succeed or give proper error */ + WARN_ON(ret > 0); + if (ret) + return ret < 0 ? ret : -EFAULT; + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, fixup->mirror_num); + wait_on_page_locked(page); + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, NULL); + + if (corrected) + WARN_ON(!PageUptodate(page)); + else + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); + + put_page(page); + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_dev *sdev; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sdev = fixup->sdev; + fs_info = fixup->root->fs_info; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.malloc_errors; + spin_unlock(&sdev->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); + } + + btrfs_free_path(path); + kfree(fixup); + + /* see caller why we're pretending to be paused in the scrub counters */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sdev->fixup_cnt); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sdev->list_wait); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. 
In the latter case, @@ -417,6 +573,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_multi_bio *multi = NULL; + struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; int i; @@ -425,12 +582,30 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* - * nodatasum, don't try to fix anything - * FIXME: we can do better, open the inode and trigger a - * writeback + * increment scrubs_running to prevent cancel requests from + * completing as long as a fixup worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the fixup worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. */ - goto uncorrectable; + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sdev->fixup_cnt); + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } length = PAGE_SIZE; @@ -1425,10 +1600,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, ret = scrub_enumerate_chunks(sdev, start, end); wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); + wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + if (progress) memcpy(progress, &sdev->stat, sizeof(*progress)); -- cgit v1.2.3-58-ga151 From d7728c960dccf775b92f2c4139f1216275a45c44 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 7 Jul 2011 16:48:38 +0200 Subject: btrfs: new ioctls to do logical->inode and inode->path resolving these ioctls make use of the new functions initially added for scrub. they return all inodes belonging to a logical address (BTRFS_IOC_LOGICAL_INO) and all paths belonging to an inode (BTRFS_IOC_INO_PATHS). Signed-off-by: Jan Schmidt --- fs/btrfs/ioctl.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 19 ++++++++ 2 files changed, 162 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 538f65a79ec5..7f57efa76d11 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "volumes.h" #include "locking.h" #include "inode-map.h" +#include "backref.h" /* Mask out flags that are inappropriate for the given type of inode. 
*/ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -2855,6 +2856,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + unsigned long rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str; + ipath->fspath->str[i] = (void *)rel_ptr; + } + + ret = copy_to_user(ipa->fspath, ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + u64 extent_offset; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + struct btrfs_key key; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 4096); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -ENOENT; + if (ret < 0) + goto out; + + extent_offset = loi->logical - key.objectid; + ret = iterate_extent_inodes(root->fs_info, path, key.objectid, + extent_offset, build_ino_list, inodes); + + if (ret < 0) + goto out; + + ret = copy_to_user(loi->inodes, inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kfree(inodes); + kfree(loi); + + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2912,6 +3051,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 1857e3871843..2da30d4950e6 100644 
--- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -204,6 +204,20 @@ struct btrfs_data_container { }; }; +struct btrfs_ioctl_ino_path_args { + __u64 inum; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + struct btrfs_data_container *fspath; /* out */ +}; + +struct btrfs_ioctl_logical_ino_args { + __u64 logical; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + struct btrfs_data_container *inodes; /* out */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -259,4 +273,9 @@ struct btrfs_data_container { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ + struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_ino_path_args) + #endif -- cgit v1.2.3-58-ga151 From a1d3c4786a4b9c71c0767aa656a759968f7554b6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 17:15:33 +0200 Subject: btrfs: btrfs_multi_bio replaced with btrfs_bio btrfs_bio is a bio abstraction able to split and not complete after the last bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio tracks the mirror_num used to read data which can be used for error correction purposes. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 10 ++-- fs/btrfs/scrub.c | 20 ++++---- fs/btrfs/volumes.c | 128 ++++++++++++++++++++++++++----------------------- fs/btrfs/volumes.h | 10 ++-- 4 files changed, 90 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..119f842c1d4f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1770,18 +1770,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { if (!stripe->dev->can_discard) continue; @@ -1800,7 +1800,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(multi); + kfree(bbio); } if (actual_bytes) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index db09f01c0e4f..97142a218f0a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -572,7 +572,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; @@ -610,8 +610,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); 
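The layout keeps the flexible-array idiom of btrfs_multi_bio, with mirror_num added. A stand-alone userspace sketch of the allocation pattern follows; the fields are a simplified, hypothetical subset, and bbio_size() only mirrors the shape of the real btrfs_bio_size() macro:

    #include <stdio.h>
    #include <stdlib.h>

    /* simplified stand-ins for the kernel structs touched by this patch */
    struct bio_stripe {
            unsigned long long physical;
    };

    struct bbio {
            int num_stripes;
            int mirror_num;                 /* copy a READ was served from */
            struct bio_stripe stripes[];    /* flexible array, sized at alloc */
    };

    #define bbio_size(n) (sizeof(struct bbio) + \
                          sizeof(struct bio_stripe) * (n))

    int main(void)
    {
            int n = 2;
            struct bbio *bbio = calloc(1, bbio_size(n));

            if (!bbio)
                    return 1;
            bbio->num_stripes = n;
            bbio->mirror_num = 1;   /* 1-based, i.e. stripe_index + 1 */
            printf("%zu bytes for %d stripes, mirror_num=%d\n",
                   bbio_size(n), bbio->num_stripes, bbio->mirror_num);
            free(bbio);
            return 0;
    }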
@@ -619,19 +619,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { + for (i = 0; i < bbio->num_stripes; ++i) { if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -640,7 +640,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -655,7 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); @@ -665,7 +665,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..9db4d7962f9b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2848,7 +2848,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, + struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; @@ -2866,18 +2866,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int i; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; - if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), GFP_NOFS); - if (!multi) + if (!bbio) return -ENOMEM; - atomic_set(&multi->error, 0); + atomic_set(&bbio->error, 0); } read_lock(&em_tree->lock); @@ -2898,7 +2898,7 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our multi bio struct is too small, back off and try again */ + /* if our btrfs_bio struct is too small, back off and try again */ if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { @@ -2917,11 +2917,11 @@ again: stripes_required = map->num_stripes; } } - if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); - kfree(multi); + kfree(bbio); goto again; } stripe_nr = offset; @@ -2950,7 +2950,7 @@ again: *length = em->len - offset; } - if (!multi_ret) + if (!bbio_ret) goto out; num_stripes = 1; @@ -2975,13 +2975,17 @@ again: stripe_index = find_live_mirror(map, 0, map->num_stripes, current->pid % map->num_stripes); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 
1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -3001,6 +3005,7 @@ again: stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); + mirror_num = stripe_index + 1; } } else { /* @@ -3009,15 +3014,16 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { u64 stripes; @@ -3038,16 +3044,16 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, map->num_stripes); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i == 0) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; stripe_offset = 0; } if (stripe_index == last_stripe) - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u64 stripes; @@ -3072,11 +3078,11 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, factor); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i < map->sub_stripes) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; if (i == map->sub_stripes - 1) stripe_offset = 0; @@ -3084,11 +3090,11 @@ again: if (stripe_index >= last_stripe && stripe_index <= (last_stripe + map->sub_stripes - 1)) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } } else - multi->stripes[i].length = *length; + bbio->stripes[i].length = *length; stripe_index++; if (stripe_index == map->num_stripes) { @@ -3099,19 +3105,20 @@ again: } } else { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = + bbio->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } out: free_extent_map(em); @@ -3120,9 +3127,9 @@ out: int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, mirror_num); } @@ -3191,28 +3198,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) - atomic_inc(&multi->error); + atomic_inc(&bbio->error); - if (bio == multi->orig_bio) + if (bio == 
bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; } else if (err) { /* @@ -3222,7 +3229,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); + kfree(bbio); bio_endio(bio, err); } else if (!is_orig_bio) { @@ -3302,20 +3309,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); BUG_ON(ret); - total_devs = multi->num_stripes; + total_devs = bbio->num_stripes; if (map_length < length) { printk(KERN_CRIT "mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -3323,25 +3330,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, (unsigned long long)map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); + + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; + bio->bi_private = bbio; + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; + dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + dev->name, dev->devid, bio->bi_size); bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -3354,8 +3364,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } dev_nr++; } - if (total_devs == 1) - kfree(multi); return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..71f4f3f67495 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -136,7 +136,10 @@ struct btrfs_bio_stripe { u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio { atomic_t stripes_pending; bio_end_io_t *end_io; struct bio *orig_bio; @@ -144,6 
+147,7 @@ struct btrfs_multi_bio { atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; @@ -171,7 +175,7 @@ struct map_lookup { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -180,7 +184,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); -- cgit v1.2.3-58-ga151 From 1503140d3ec2be9b917d2f8f7c64cb77b79a215b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 16 Jun 2011 14:37:03 +0200 Subject: btrfs: Do not use bio->bi_bdev after submission The block layer modifies bio->bi_bdev and bio->bi_sector while working on the bio; they do _not_ come back unmodified in the completion callback. To call add_page, we need at least some bi_bdev set, which is why the code was working previously. With this patch, we use the latest_bdev from fsinfo instead of the leftover in the bio. This gives us the possibility to use the bi_bdev field for another purpose. Signed-off-by: Jan Schmidt --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index efee8c7cfa45..936a6fabaa9f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1915,7 +1915,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_size = 0; bio_add_page(bio, page, failrec->len, start - page_offset(page)); -- cgit v1.2.3-58-ga151 From 2774b2ca3d49124bf1ae89e8d575b3dab4221266 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 16 Jun 2011 12:05:19 +0200 Subject: btrfs: Put mirror_num in bi_bdev The error correction code wants to make sure that only the bad mirror is rewritten. Thus, we need to know which mirror is the bad one. I did not find a more appropriate field than bi_bdev. But I think using this is fine, because it is modified by the block layer anyway and should not be read after the bio has returned.
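In effect, the mirror number is just an integer smuggled through a pointer-sized field once the block layer is done with it. A minimal stand-alone sketch of the round-trip casts (stored on completion in btrfs_end_bio, recovered in the readpage error path):

    #include <stdio.h>

    struct block_device;    /* opaque, as in the kernel */

    int main(void)
    {
            int mirror_num = 2;
            struct block_device *bi_bdev;
            int failed_mirror;

            /* completion side: stash the mirror number in the pointer field */
            bi_bdev = (struct block_device *)(unsigned long)mirror_num;

            /* error path: recover it with the inverse cast */
            failed_mirror = (int)(unsigned long)bi_bdev;

            printf("failed_mirror=%d\n", failed_mirror);
            return 0;
    }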
Signed-off-by: Jan Schmidt --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9db4d7962f9b..18baac5a3f6c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3216,6 +3216,8 @@ static void btrfs_end_bio(struct bio *bio, int err) } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ -- cgit v1.2.3-58-ga151 From 4a54c8c165b66300830a67349fc7595e3fc442f7 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 22 Jul 2011 15:41:52 +0200 Subject: btrfs: Moved repair code from inode.c to extent_io.c The raid-retry code in inode.c can be generalized so that it works for metadata as well. Thus, this patch moves it to extent_io.c and makes the raid-retry code a raid-repair code. Repair works that way: Whenever a read error occurs and we have more mirrors to try, note the failed mirror, and retry another. If we find a good one, check if we did note a failure earlier and if so, do not allow the read to complete until after the bad sector was written with the good data we just fetched. As we have the extent locked while reading, no one can change the data in between. Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.c | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/extent_io.h | 10 +- fs/btrfs/inode.c | 155 +-------------------- 3 files changed, 393 insertions(+), 159 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index afebb95e3490..624ef10d36cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@ #include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree, return 0; } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + if (did_repair) { + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + } + + kfree(rec); + return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ + complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. 
here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchronization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. + */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) +{ + struct bio *bio; + struct btrfs_device *dev; + DECLARE_COMPLETION_ONSTACK(compl); + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + int ret; + + BUG_ON(!mirror_num); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_private = &compl; + bio->bi_end_io = repair_io_failure_callback; + bio->bi_size = 0; + map_length = length; + + ret = btrfs_map_block(map_tree, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start-page_offset(page)); + submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + return -EIO; + } + + printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " + "sector %llu)\n", page->mapping->host->i_ino, start, + dev->name, sector); + + bio_put(bio); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_mapping_tree *map_tree; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + struct inode *inode = page->mapping->host; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start == failrec->start) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + num_copies = btrfs_num_copies(map_tree, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(map_tree, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook).
if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" + "state=%p, num_copies=%d, next_mirror %d, " + "failed_mirror %d\n", state, num_copies, + failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + if (!state) { + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&tree->lock); + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (!state || failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " + "next_mirror %d, failed_mirror %d\n", state, + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, + failrec->bio_flags, 0); + return 0; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ /* @@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; + pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err) state); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + if (!uptodate) { + u64 failed_mirror; + failed_mirror = (u64)bio->bi_bdev; + if (tree->ops && tree->ops->readpage_io_failed_hook) + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, NULL); + else + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { uptodate = 
test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, mirror_num, bio_flags, start); else submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); @@ -2926,7 +3301,7 @@ out: return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; @@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len) { return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 435d454b9926..a8e20b672922 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -68,7 +68,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, + u64 start, u64 end, u64 failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -252,6 +252,8 @@ void free_extent_buffer(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +unsigned long num_extent_pages(u64 start, u64 len); +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); static inline void extent_buffer_get(struct extent_buffer *eb) { @@ -301,4 +303,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); + +struct btrfs_mapping_tree; + +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 936a6fabaa9f..9327f45434e8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,10 +45,10 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" -#include "volumes.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "volumes.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -1818,154 +1818,10 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return btrfs_finish_ordered_io(page->mapping->host, start, end); } -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - struct extent_state *state) -{ - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; - struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int rw; - u64 logical; - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->last_mirror = 0; - failrec->bio_flags = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - - if (IS_ERR_OR_NULL(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - failrec->logical = logical; - free_extent_map(em); - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | - EXTENT_DIRTY, GFP_NOFS); - set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - failrec->last_mirror++; - if (!state) { - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - } - if (!state || failrec->last_mirror > num_copies) { - set_state_private(failure_tree, failrec->start, 0); - clear_extent_bits(failure_tree, failrec->start, - failrec->start + failrec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - kfree(failrec); - return -EIO; - } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; - - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & REQ_WRITE) - rw = WRITE; - else - rw = READ; - - ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror, - failrec->bio_flags, 0); - return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failure; - int ret; - - private = 0; - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0)) { - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, - start, &private_failure); - if (ret == 0) { - failure = (struct io_failure_record *)(unsigned long) - private_failure; - 
set_state_private(&BTRFS_I(inode)->io_failure_tree, - failure->start, 0); - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, - failure->start, - failure->start + failure->len - 1, - EXTENT_DIRTY | EXTENT_LOCKED, - GFP_NOFS); - kfree(failure); - } - } - return 0; -} - /* * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) @@ -2011,10 +1867,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, kunmap_atomic(kaddr, KM_USER0); good: - /* if the io failure tree for this inode is non-empty, - * check to see if we've recovered from a failed IO - */ - btrfs_clean_io_failures(inode, start); return 0; zeroit: @@ -7420,7 +7272,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, - .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, -- cgit v1.2.3-58-ga151 From 5da6fcbc4eb50c0f55d520750332f5a6ab13508c Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 18:11:04 +0200 Subject: btrfs: integrating raid-repair and scrub-fixup-nodatasum This ties nodatasum fixup in scrub together with raid repair patches. While both series are working fine alone, scrub will report uncorrectable errors if they occur in a nodatasum extent *and* the page is in the page cache. Previously, we would have triggered readpage to find good data and do the repair. However, readpage wouldn't read anything in the case where the page is up to date in the cache. So, we simply take that good data we have and call repair_io_failure directly (unless the page in the cache is dirty). Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 92 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 97142a218f0a..eba42e5fd5fd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -24,6 +24,7 @@ #include "ordered-data.h" #include "transaction.h" #include "backref.h" +#include "extent_io.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -360,13 +361,13 @@ out: static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) { - struct page *page; + struct page *page = NULL; unsigned long index; struct scrub_fixup_nodatasum *fixup = ctx; int ret; - int corrected; + int corrected = 0; struct btrfs_key key; - struct inode *inode; + struct inode *inode = NULL; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; @@ -384,34 +385,75 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); - - /* set_extent_bit should either succeed or give proper error */ - WARN_ON(ret > 0); - if (ret) - return ret < 0 ? 
ret : -EFAULT; - index = offset >> PAGE_CACHE_SHIFT; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + struct btrfs_mapping_tree *map_tree; + if (PageDirty(page)) { + /* + * we need to write the data to the defective sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error as + * uncorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } - ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, - btrfs_get_extent, fixup->mirror_num); - wait_on_page_locked(page); - corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, NULL); + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); - if (corrected) - WARN_ON(!PageUptodate(page)); - else - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); - put_page(page); - iput(inode); +out: + if (page) + put_page(page); + if (inode) + iput(inode); if (ret < 0) return ret; -- cgit v1.2.3-58-ga151 From bb82ab88dfdb12948af58989c75bfe904bc1b09d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 14:06:53 +0200 Subject: btrfs: add an extra wait mode to read_extent_buffer_pages read_extent_buffer_pages currently has two modes: either trigger a read without waiting for anything, or wait for the I/O to finish. The former also bails when it's unable to lock the page. This patch now adds an additional parameter to allow it to block on page lock, but not wait for completion.
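As a rough usage sketch (not part of the patch; io_tree, eb and btree_get_extent stand in for a caller's extent IO tree, extent buffer and get_extent callback), the three modes introduced below would be used like this:

	/* fire and forget: trylock each page, bail out if any page is already locked */
	read_extent_buffer_pages(io_tree, eb, 0, WAIT_NONE, btree_get_extent, 0);

	/* block until every page is locked, but do not wait for the read I/O */
	read_extent_buffer_pages(io_tree, eb, 0, WAIT_PAGE_LOCK, btree_get_extent, 0);

	/* fully synchronous read: wait until the I/O has completed */
	read_extent_buffer_pages(io_tree, eb, 0, WAIT_COMPLETE, btree_get_extent, 0);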
Changes v5: - merge the 2 wait parameters into one and define WAIT_NONE, WAIT_COMPLETE and WAIT_PAGE_LOCK Change v6: - fix bug introduced in v5 Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/extent_io.h | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..e6170e142cad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) @@ -974,7 +975,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..823028e73cf8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3349,8 +3349,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3386,7 +3385,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3430,7 +3429,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..6d74c6b34691 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,6 +248,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); -- cgit v1.2.3-58-ga151 From ab0fff03055d2d1b01a7581badeba18db9c4f55c Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 23 May 2011 14:25:41 +0200 Subject: btrfs: add READAHEAD extent buffer flag Add a READAHEAD extent buffer flag. Add a function to trigger a read with this flag set. 
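A hedged caller sketch (not part of the patch; root, bytenr, blocksize and mirror_num are assumed to come from the caller): per the diff below, reada_tree_block_flagged returns 0 both when it merely triggered the read and when the buffer was already uptodate, and only in the latter case does it hand back the buffer:

	struct extent_buffer *eb = NULL;
	int err;

	err = reada_tree_block_flagged(root, bytenr, blocksize, mirror_num, &eb);
	if (err)
		return err;	/* read failed or the buffer is marked corrupt */
	if (eb) {
		/* buffer was already uptodate; use it, then drop the reference */
		free_extent_buffer(eb);
	}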
Changes v2: - use extent buffer flags instead of extent state flags Changes v5: - adapt to changed read_extent_buffer_pages interface - don't return eb from reada_tree_block_flagged if it has CORRUPT flag set Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/extent_io.h | 1 + 3 files changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6170e142cad..1220d04072c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -980,6 +980,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..b3bdb5c1390f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d74c6b34691..fcaf49bcb880 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -- cgit v1.2.3-58-ga151 From 90519d66abbccc251d14719ac76f191f70826e40 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 23 May 2011 14:30:00 +0200 Subject: btrfs: state information for readahead Add state information for readahead to btrfs_fs_info and btrfs_device Changes v2: - don't wait in radix_trees - add own set of workers for readahead Reviewed-by: Josef Bacik Signed-off-by: Arne Jansen --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/disk-io.c | 10 ++++++++++ fs/btrfs/volumes.c | 8 ++++++++ fs/btrfs/volumes.h | 8 ++++++++ 4 files changed, 31 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..f71fd24cc152 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1036,6 +1036,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; struct btrfs_workers caching_workers; 
+ struct btrfs_workers readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1119,6 +1120,10 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1220d04072c8..fd28c40e9ec3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1711,6 +1711,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); + fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1903,6 +1907,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->readahead_workers, "readahead", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1913,6 +1920,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; + fs_info->readahead_workers.idle_thresh = 2; btrfs_start_workers(&fs_info->workers, 1); btrfs_start_workers(&fs_info->generic_worker, 1); @@ -1926,6 +1934,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_freespace_worker, 1); btrfs_start_workers(&fs_info->delayed_workers, 1); btrfs_start_workers(&fs_info->caching_workers, 1); + btrfs_start_workers(&fs_info->readahead_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2650,6 +2659,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..1dccce5bc93d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(&device->dev_alloc_list); + /* init readahead state */ + spin_lock_init(&device->reada_lock); + device->reada_curr_zone = NULL; + atomic_set(&device->reada_in_flight, 0); + device->reada_next = 0; + INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); + mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..2a751246188a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,14 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; }; struct btrfs_fs_devices { -- cgit v1.2.3-58-ga151 From 7414a03fbf9e75fbbf2a3c16828cd862e572aa44 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: 
Mon, 23 May 2011 14:33:49 +0200 Subject: btrfs: initial readahead code and prototypes This is the implementation for the generic read ahead framework. To trigger a readahead, btrfs_reada_add must be called. It will start a read ahead for the given range [start, end) on tree root. The returned handle can either be used to wait on the readahead to finish (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). The read ahead works as follows: On btrfs_reada_add, the root of the tree is inserted into a radix_tree. reada_start_machine will then search for extents to prefetch and trigger some reads. When a read finishes for a node, all contained node/leaf pointers that lie in the given range will also be enqueued. The reads will be triggered in sequential order, thus giving a big win over a naive enumeration. It will also make use of multi-device layouts. Each disk will have its own read pointer and all disks will be utilized in parallel. Also, no two disks will read both sides of a mirror simultaneously, as this would waste seeking capacity. Instead both disks will read different parts of the filesystem. Any number of readaheads can be started in parallel. The read order will be determined globally, i.e. 2 parallel readaheads will normally finish faster than the 2 started one after another. Changes v2: - protect root->node by transaction instead of node_lock - fix missed branches: The readahead had too simple a check to determine if a branch from a node should be checked or not. It now also records the upper bound of each node to see if the requested RA range lies within. - use KERN_CONT for debug output, to avoid line breaks - defer reada_start_machine to worker to avoid deadlock Changes v3: - protect root->node by rcu Changes v5: - changed EIO-semantics of reada_tree_block_flagged - remove spin_lock from reada_control and make elems an atomic_t - remove unused read_total from reada_control - kill reada_key_cmp, use btrfs_comp_cpu_keys instead - use kref-style release functions where possible - return struct reada_control * instead of void * from btrfs_reada_add Signed-off-by: Arne Jansen --- fs/btrfs/Makefile | 3 +- fs/btrfs/ctree.h | 16 + fs/btrfs/reada.c | 949 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 967 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/reada.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac08c21f..bdd6fb238ce1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f71fd24cc152..370af767440d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2702,4 +2702,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct
reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + #endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 000000000000..2b701d082227 --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,949 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. Each disk + * will have its own read pointer and all disks will be utilized in parallel. + * Also, no two disks will read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another.
+ */ + +#define MAX_MIRRORS 2 +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + u32 blocksize; + int err; + struct list_head extctl; + struct kref refcnt; + spinlock_t lock; + struct reada_zone *zones[MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. 
+ * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + printk(KERN_DEBUG "generation mismatch for " + "(%llu,%d,%llu) %llu != %llu\n", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_multi_bio *multi) +{ + int ret; + int looped = 0; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + +again: + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + if (looped) + return NULL; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < multi->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = multi->stripes[i].dev; + } + zone->ndevs = multi->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + zone); + spin_unlock(&fs_info->reada_lock); + + if (ret) { + kfree(zone); + looped = 1; + goto again; + } + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + int looped = 0; + struct reada_extent *re = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_multi_bio *multi = NULL; + struct btrfs_device *dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; 
+ unsigned long index = logical >> PAGE_CACHE_SHIFT; + +again: + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (re || looped) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = btrfs_level_size(root, level); + re->logical = logical; + re->blocksize = blocksize; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + kref_init(&re->refcnt); + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0); + if (ret || !multi || length < blocksize) + goto error; + + if (multi->num_stripes > MAX_MIRRORS) { + printk(KERN_ERR "btrfs readahead: more than %d copies not " + "supported", MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < multi->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = multi->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, multi); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret) { + spin_unlock(&fs_info->reada_lock); + if (ret != -ENOMEM) { + /* someone inserted the extent in the meantime */ + looped = 1; + } + goto error; + } + for (i = 0; i < nzones; ++i) { + dev = multi->stripes[i].dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = multi->stripes[i].dev; + BUG_ON(dev == NULL); + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + kfree(re); + if (looped) + goto again; + return NULL; +} + +static void reada_kref_dummy(struct kref *kr) +{ +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (!kref_put(&re->refcnt, reada_kref_dummy)) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no 
fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + u32 blocksize; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. 
If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + re->blocksize; + kref_get(&re->refcnt); + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + blocksize = re->blocksize; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + __reada_start_machine(fs_info); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. 
+ */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + rmw->work.func = reada_start_machine_worker; + rmw->fs_info = fs_info; + + btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, re->blocksize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, re->blocksize, list_empty(&re->extctl), + re->scheduled_for ? 
re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + reada_add_block(rc, start, &max_key, level, generation); + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} -- cgit v1.2.3-58-ga151 From 4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 13:55:54 +0200 Subject: btrfs: hooks for readahead This adds the hooks needed for readahead. In the readpage_end_io_hook, the extent buffer is checked for the EXTENT_BUFFER_READAHEAD flag. Only in this case is the readahead hook called, to keep the impact on non-readahead reads as low as possible. Additionally, a hook for a failed IO is added; otherwise readahead would wait indefinitely for the extent to finish.
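The pattern both hooks below share can be sketched as follows (taken from the diff; err stands for the status the end_io path computed). The flag is cleared before the callback runs, so each buffer is handed to the readahead machinery at most once:

	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
		btree_readahead_hook(root, eb, eb->start, err);
	}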
Changes for v2: - eliminate race condition Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 37 +++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.c | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fd28c40e9ec3..2151828aa142 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -609,11 +609,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, -EIO); + } + +out: + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -3166,6 +3202,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 823028e73cf8..deba714236d7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1731,7 +1731,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + start, end, state); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -- cgit v1.2.3-58-ga151 From 7a26285eea8eb92e0088db011571d887d4551b0f Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 12:39:23 +0200 Subject: btrfs: use readahead API for scrub Scrub uses a simple tree-enumeration to bring the relevant portions of the extent- and csum-tree into the page cache before starting the scrub-I/O. This is now replaced by using the new readahead-API. During readahead the scrub is accounted as paused, so it won't hold off transaction commits. This change raises the average disk bandwidth utilisation on my test volume from 70% to 90%. On another volume, the time for a test run went down from 89s to 43s.
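In outline, the readahead pattern scrub adopts looks like this (a sketch condensed from the diff below; the key ranges are placeholders and are computed per stripe there):

	struct reada_control *reada1, *reada2;

	reada1 = btrfs_reada_add(root, &key_start, &key_end);	/* extent tree */
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);	/* csum tree; keys refilled with the csum range */
	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);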
Changes v5: - reada1/2 are now of type struct reada_control * Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 112 +++++++++++++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..f930f2776589 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -29,15 +29,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - * extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are * affected and report them * - In case of a read error on files with nodatasum, map the file and read * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; u64 mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; - path->reada = 2; path->search_commit_root = 1; path->skip_locking = 1; /* - * find all extents for each stripe and just read them to get - * them into the page cache - * FIXME: we can do better. build a more intelligent prefetching + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits */ logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_noplug; - - /* - * we might miss half an extent here, but that doesn't matter, - * as it's only the prefetch - */ - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out_noplug; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); - if (key.objectid >= logical + map->stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path->slots[0]++; - } - 
btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - cond_resched(); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during * the scrub. This might currently (crc32) end up to be about 1MB */ - start_stripe = 0; blk_start_plug(&plug); -again: - logical = base + offset + start_stripe * increment; - for (i = start_stripe; i < nstripes; ++i) { - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - logical += increment; - cond_resched(); - } /* * now find all extents for each stripe and scrub them */ - logical = base + offset + start_stripe * increment; - physical = map->stripes[num].physical + start_stripe * map->stripe_len; + logical = base + offset; + physical = map->stripes[num].physical; ret = 0; - for (i = start_stripe; i < nstripes; ++i) { + for (i = 0; i < nstripes; ++i) { /* * canceled? */ @@ -882,11 +868,14 @@ again: atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - scrub_free_csums(sdev); - start_stripe = i; - goto again; } + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)0; @@ -982,7 +971,6 @@ next: out: blk_finish_plug(&plug); -out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } -- cgit v1.2.3-58-ga151 From af6a311384bce6c88e15c80ab22ab051a918b4eb Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 3 Oct 2011 20:46:17 -0600 Subject: writeback: add bg_threshold parameter to __bdi_update_bandwidth() No behavior change. 
Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 1 + mm/page-writeback.c | 11 +++++++---- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 04cf3b91e501..28076562ada0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -675,7 +675,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2b8963ff0f35..ddb4652cb337 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -143,6 +143,7 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, void __bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long thresh, + unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c16ddd8f5cb6..4b954c9fe846 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -779,6 +779,7 @@ static void global_update_bandwidth(unsigned long thresh, void __bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long thresh, + unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, @@ -815,6 +816,7 @@ snapshot: static void bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long thresh, + unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, @@ -823,8 +825,8 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) return; spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, - start_time); + __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty, start_time); spin_unlock(&bdi->wb.list_lock); } @@ -912,8 +914,9 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; - bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, - bdi_thresh, bdi_dirty, start_time); + bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, + nr_dirty, bdi_thresh, bdi_dirty, + start_time); /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked -- cgit v1.2.3-58-ga151 From b00949aa2df9970a912bf060bc95e99da356881c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 18 Nov 2010 14:38:33 -0600 Subject: writeback: per-bdi background threshold One thing that puzzled me is that in the JBOD case, the per-disk writeout performance is smaller than the corresponding single-disk case even when they have comparable bdi_thresh. Tracing finds that in the single disk case, bdi_writeback is always kept high while in the JBOD case, it could drop low from time to time and correspondingly bdi_reclaimable could sometimes rush high. The fix is to watch bdi_reclaimable and kick background writeback as soon as it goes high. This resembles the global background threshold but in a per-bdi manner. The trick is, as long as bdi_reclaimable does not go high, bdi_writeback naturally won't go low because bdi_reclaimable+bdi_writeback ~= bdi_thresh. With less fluctuation in writeback pages, JBOD performance is observed to increase noticeably in various cases.
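To put hypothetical numbers on it: with bdi_thresh at 200 MB, letting bdi_reclaimable rush to 180 MB leaves only ~20 MB of pages actually under writeback by the invariant above, and the disk goes idle; kicking the flusher as soon as bdi_reclaimable crosses the bdi's share of the background threshold is what keeps bdi_writeback high. The shape of the check, as a sketch of what over_bground_thresh() becomes in the diff below:

	static bool over_bground_thresh(struct backing_dev_info *bdi)
	{
		unsigned long background_thresh, dirty_thresh;

		global_dirty_limits(&background_thresh, &dirty_thresh);

		/* global background threshold, as before */
		if (global_page_state(NR_FILE_DIRTY) +
		    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
			return true;

		/* new: per-bdi background threshold */
		return bdi_stat(bdi, BDI_RECLAIMABLE) >
		       bdi_dirty_limit(bdi, background_thresh);
	}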
vmstat:nr_written values before/after patch:

  3.1.0-rc4-wo-underrun+    3.1.0-rc4-bgthresh3+
------------------------  ------------------------
 125596480   +25.9%   158179363  JBOD-10HDD-16G/ext4-100dd-1M-24p-16384M-20:10-X
  61790815  +110.4%   130032231  JBOD-10HDD-16G/ext4-10dd-1M-24p-16384M-20:10-X
  58853546    -0.1%    58823828  JBOD-10HDD-16G/ext4-1dd-1M-24p-16384M-20:10-X
 110159811   +24.7%   137355377  JBOD-10HDD-16G/xfs-100dd-1M-24p-16384M-20:10-X
  69544762   +10.8%    77080047  JBOD-10HDD-16G/xfs-10dd-1M-24p-16384M-20:10-X
  50644862    +0.5%    50890006  JBOD-10HDD-16G/xfs-1dd-1M-24p-16384M-20:10-X
  42677090   +28.0%    54643527  JBOD-10HDD-thresh=100M/ext4-100dd-1M-24p-16384M-100M:10-X
  47491324   +13.3%    53785605  JBOD-10HDD-thresh=100M/ext4-10dd-1M-24p-16384M-100M:10-X
  52548986    +0.9%    53001031  JBOD-10HDD-thresh=100M/ext4-1dd-1M-24p-16384M-100M:10-X
  26783091   +36.8%    36650248  JBOD-10HDD-thresh=100M/xfs-100dd-1M-24p-16384M-100M:10-X
  35526347   +14.0%    40492312  JBOD-10HDD-thresh=100M/xfs-10dd-1M-24p-16384M-100M:10-X
  44670723    -1.1%    44177606  JBOD-10HDD-thresh=100M/xfs-1dd-1M-24p-16384M-100M:10-X
 127996037   +22.4%   156719990  JBOD-10HDD-thresh=2G/ext4-100dd-1M-24p-16384M-2048M:10-X
  57518856    +3.8%    59677625  JBOD-10HDD-thresh=2G/ext4-10dd-1M-24p-16384M-2048M:10-X
  51919909   +12.2%    58269894  JBOD-10HDD-thresh=2G/ext4-1dd-1M-24p-16384M-2048M:10-X
  86410514   +79.0%   154660433  JBOD-10HDD-thresh=2G/xfs-100dd-1M-24p-16384M-2048M:10-X
  40132519   +38.6%    55617893  JBOD-10HDD-thresh=2G/xfs-10dd-1M-24p-16384M-2048M:10-X
  48423248    +7.5%    52042927  JBOD-10HDD-thresh=2G/xfs-1dd-1M-24p-16384M-2048M:10-X
 206041046   +44.1%   296846536  JBOD-10HDD-thresh=4G/xfs-100dd-1M-24p-16384M-4096M:10-X
  72312903   -19.4%    58272885  JBOD-10HDD-thresh=4G/xfs-10dd-1M-24p-16384M-4096M:10-X
  50635672    -0.5%    50384787  JBOD-10HDD-thresh=4G/xfs-1dd-1M-24p-16384M-4096M:10-X
  68308534  +115.7%   147324758  JBOD-10HDD-thresh=800M/ext4-100dd-1M-24p-16384M-800M:10-X
  57882933   +14.5%    66269621  JBOD-10HDD-thresh=800M/ext4-10dd-1M-24p-16384M-800M:10-X
  52183472   +12.8%    58855181  JBOD-10HDD-thresh=800M/ext4-1dd-1M-24p-16384M-800M:10-X
  53788956   +94.2%   104460352  JBOD-10HDD-thresh=800M/xfs-100dd-1M-24p-16384M-800M:10-X
  44493342   +35.5%    60298210  JBOD-10HDD-thresh=800M/xfs-10dd-1M-24p-16384M-800M:10-X
  42641209   +18.9%    50681038  JBOD-10HDD-thresh=800M/xfs-1dd-1M-24p-16384M-800M:10-X

Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 28076562ada0..6401cd76f109 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -658,14 +658,21 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) return nr_pages - work.nr_pages; } -static inline bool over_bground_thresh(void) +static bool over_bground_thresh(struct backing_dev_info *bdi) { unsigned long background_thresh, dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (bdi_stat(bdi, BDI_RECLAIMABLE) > + bdi_dirty_limit(bdi, background_thresh)) + return true; + + return false; } /* @@ -727,7 +734,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(wb->bdi)) break; if (work->for_kupdate) { @@ -811,7 +818,7
@@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { + if (over_bground_thresh(wb->bdi)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, -- cgit v1.2.3-58-ga151 From 1958c7c284298f0d17a1ab3820c77c40ea3d3711 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:42:43 -0700 Subject: exofs/ore.c: local functions should be static This quiets the sparse noise: warning: symbol '_calc_trunk_info' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af88198..6114fdffcad8 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -753,8 +753,8 @@ struct _trunc_info { unsigned max_devs; }; -void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) { unsigned stripe_unit = layout->stripe_unit; -- cgit v1.2.3-58-ga151 From de74b05ace743b4a7aefad9e9b33ff899979b34a Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:46:51 -0700 Subject: exofs/super.c: local functions should be static This quiets the following sparse noise: warning: symbol 'exofs_sync_fs' was not declared. Should it be static? warning: symbol 'exofs_free_sbi' was not declared. Should it be static? warning: symbol 'exofs_get_parent' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Signed-off-by: Boaz Harrosh --- fs/exofs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b02..afe79ee088ad 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -355,7 +355,7 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; @@ -429,7 +429,7 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { while (sbi->comps.numdevs) { int i = --sbi->comps.numdevs; @@ -981,7 +981,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); -- cgit v1.2.3-58-ga151 From 5bf696dad4beecb6174e701c97e1f2574e6a2c96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:39:59 +0300 Subject: exofs: Rename struct ore_components comps => oc ore_components already has a comps member so this leads to things like comps->comps which is annoying. the name oc was already used in new code. So rename all old usage of ore_components comps => ore_components oc. 
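A one-line before/after from ore.c makes the point (taken from the _ios_cred() accessor in the diff below):

	/* before: two different "comps" in one expression */
	return ios->comps->comps[index & ios->comps->single_comp].cred;
	/* after */
	return ios->oc->comps[index & ios->oc->single_comp].cred;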
Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 16 +++++++------- fs/exofs/inode.c | 22 +++++++++---------- fs/exofs/ore.c | 30 +++++++++++++------------- fs/exofs/super.c | 58 +++++++++++++++++++++++++------------------------- include/scsi/osd_ore.h | 2 +- 5 files changed, 64 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442ec7445..c09d5a765efe 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -71,7 +71,7 @@ struct exofs_sb_info { */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components comps; /* comps for the partition */ + struct ore_components oc; /* comps for the partition */ struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; @@ -86,7 +86,7 @@ struct exofs_i_info { uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ struct ore_comp one_comp; /* same component for all devices */ - struct ore_components comps; /* inode view of the device table */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; * bigger and that the device table repeats twice. * See: exofs_read_lookup_dev_table() */ -static inline void exofs_init_comps(struct ore_components *comps, +static inline void exofs_init_comps(struct ore_components *oc, struct ore_comp *one_comp, struct exofs_sb_info *sbi, osd_id oid) { @@ -217,13 +217,13 @@ static inline void exofs_init_comps(struct ore_components *comps, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - comps->numdevs = sbi->comps.numdevs; - comps->single_comp = EC_SINGLE_COMP; - comps->comps = one_comp; + oc->numdevs = sbi->oc.numdevs; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; - comps->ods = sbi->comps.ods + first_dev; + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = sbi->oc.ods + first_dev; } #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38fc2349..61b2f7e5cdbd 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -270,7 +270,7 @@ static int read_exec(struct page_collect *pcol) return 0; if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -516,7 +516,7 @@ static int write_exec(struct page_collect *pcol) return 0; BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -860,7 +860,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -927,14 +927,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_on_disk_inode_layout *layout; int ret; - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { 
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -1018,7 +1018,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); /* read the inode from the osd */ @@ -1172,13 +1172,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); @@ -1267,7 +1267,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; @@ -1350,7 +1350,7 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 6114fdffcad8..870f85a232d1 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -49,20 +49,20 @@ MODULE_LICENSE("GPL"); static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { - return ios->comps->comps[index & ios->comps->single_comp].cred; + return ios->oc->comps[index & ios->oc->single_comp].cred; } static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) { - return &ios->comps->comps[index & ios->comps->single_comp].obj; + return &ios->oc->comps[index & ios->oc->single_comp].obj; } static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->comps->ods[index]; + return ios->oc->ods[index]; } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, bool is_reading, u64 offset, u64 length, struct ore_io_state **pios) { @@ -71,16 +71,16 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); if (unlikely(!ios)) { ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(comps->numdevs)); + ore_io_state_size(oc->numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; - ios->comps = comps; + ios->oc = oc; ios->offset = offset; ios->length = length; ios->reading = is_reading; @@ 
-90,10 +90,10 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, } EXPORT_SYMBOL(ore_get_rw_state); -int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, struct ore_io_state **ios) { - return ore_get_rw_state(layout, comps, true, 0, 0, ios); + return ore_get_rw_state(layout, oc, true, 0, 0, ios); } EXPORT_SYMBOL(ore_get_io_state); @@ -476,7 +476,7 @@ int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -501,7 +501,7 @@ int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -768,7 +768,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, ti->max_devs = layout->group_width * layout->group_count; } -int ore_truncate(struct ore_layout *layout, struct ore_components *comps, +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, u64 size) { struct ore_io_state *ios; @@ -779,7 +779,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, struct _trunc_info ti; int i, ret; - ret = ore_get_io_state(layout, comps, &ios); + ret = ore_get_io_state(layout, oc, &ios); if (unlikely(ret)) return ret; @@ -792,7 +792,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, goto out; } - ios->numdevs = ios->comps->numdevs; + ios->numdevs = ios->oc->numdevs; for (i = 0; i < ti.max_devs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; @@ -815,7 +815,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, size_attr->attr.val_ptr = &size_attr->newsize; ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(comps->comps->obj.id), _LLU(obj_size), i); + _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) diff --git a/fs/exofs/super.c b/fs/exofs/super.c index afe79ee088ad..babb1958f6f0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -360,7 +360,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct ore_comp one_comp; - struct ore_components comps; + struct ore_components oc; struct ore_io_state *ios; int ret = -ENOMEM; @@ -378,9 +378,9 @@ static int exofs_sync_fs(struct super_block *sb, int wait) * the writeable info is set in exofs_sbi_write_stats() above. 
*/ - exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - ret = ore_get_io_state(&sbi->layout, &comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -431,17 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path, static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->comps.numdevs) { - int i = --sbi->comps.numdevs; - struct osd_dev *od = sbi->comps.ods[i]; + while (sbi->oc.numdevs) { + int i = --sbi->oc.numdevs; + struct osd_dev *od = sbi->oc.ods[i]; if (od) { - sbi->comps.ods[i] = NULL; + sbi->oc.ods[i] = NULL; osduld_put_device(od); } } - if (sbi->comps.ods != sbi->_min_one_dev) - kfree(sbi->comps.ods); + if (sbi->oc.ods != sbi->_min_one_dev) + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +468,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0], sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -623,7 +623,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, return -ENOMEM; } - sbi->comps.numdevs = 0; + sbi->oc.numdevs = 0; comp.obj.partition = sbi->one_comp.obj.partition; comp.obj.id = EXOFS_DEVTABLE_ID; @@ -648,13 +648,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->comps.ods[0]); + unsigned size = numdevs * sizeof(sbi->oc.ods[0]); /* Twice bigger table: See exofs_init_comps() and below * comment */ - sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->comps.ods)) { + sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL); + if (unlikely(!sbi->oc.ods)) { EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", numdevs); ret = -ENOMEM; @@ -681,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->comps.ods[i] = fscb_od; - ++sbi->comps.numdevs; + sbi->oc.ods[i] = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -695,8 +695,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->comps.ods[i] = od; - ++sbi->comps.numdevs; + sbi->oc.ods[i] = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -719,7 +719,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); if (likely(!ret)) { - unsigned numdevs = sbi->comps.numdevs; + unsigned numdevs = sbi->oc.numdevs; if (unlikely(fscb_od)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); @@ -732,7 +732,7 @@ out: * starting at this device. 
See exofs_init_comps() */ for (i = 0; i < numdevs - 1; ++i) - sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; + sbi->oc.ods[i + numdevs] = sbi->oc.ods[i]; } return ret; } @@ -783,10 +783,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->comps.numdevs = 1; - sbi->comps.single_comp = EC_SINGLE_COMP; - sbi->comps.comps = &sbi->one_comp; - sbi->comps.ods = sbi->_min_one_dev; + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; + sbi->oc.ods = sbi->_min_one_dev; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -835,7 +835,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->comps.ods[0] = od; + sbi->oc.ods[0] = od; } __sbi_read_stats(sbi); @@ -875,7 +875,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0], sbi->one_comp.obj.partition); return 0; @@ -924,7 +924,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint64_t used = ULLONG_MAX; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index c5c5e008e6de..954292a23767 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -64,7 +64,7 @@ struct ore_io_state { ore_io_done_fn done; struct ore_layout *layout; - struct ore_components *comps; + struct ore_components *oc; /* Global read/write IO*/ loff_t offset; -- cgit v1.2.3-58-ga151 From 8d2d83a8352b0f9c1da82c36f741722f2960feea Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:15:02 -0700 Subject: exofs: Remove unused data_map member from exofs_sb_info The struct pnfs_osd_data_map data_map member of exofs_sb_info was never used after mount. In fact all it's members were duplicated by the ore_layout structure. So just remove the duplicated information. Also removed some stupid, but perfectly supported, restrictions on layout parameters. The case where num_devices is not divisible by mirror_count+1 is perfectly fine since the rotating device view will eventually use all the devices it can get. Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- fs/exofs/exofs.h | 3 --- fs/exofs/super.c | 57 +++++++++++++++++++------------------------------- include/scsi/osd_ore.h | 2 ++ 3 files changed, 24 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c09d5a765efe..3b2e0478f363 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -66,9 +66,6 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? 
- */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index babb1958f6f0..90b4c526939f 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -481,64 +481,51 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, { u64 stripe_length; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); /* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) { EXOFS_ERR("Only RAID_0 for now\n"); return -EINVAL; } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); + if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) { + EXOFS_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + numdevs, sbi->layout.group_width, + sbi->layout.mirrors_p1); return -EINVAL; } - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) { EXOFS_ERR("Stripe Unit(0x%llx)" " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + _LLU(sbi->layout.stripe_unit), PAGE_SIZE); return -EINVAL; } - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (sbi->layout.group_width) { if (!sbi->layout.group_depth) { EXOFS_ERR("group_depth == 0 && group_width != 0\n"); return -EINVAL; } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; + sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 / + sbi->layout.group_width; } else { - if (sbi->data_map.odm_group_depth) { + if (sbi->layout.group_depth) { printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; + "group_width == 0 && group_depth == %lld\n", + _LLU(sbi->layout.group_depth)); } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; + sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1; sbi->layout.group_depth = -1; sbi->layout.group_count = 1; } @@ -558,7 +545,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.group_width, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, - 
sbi->data_map.odm_raid_algorithm); + sbi->layout.raid_algorithm); return 0; } diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 954292a23767..f7fabb478877 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -34,6 +34,8 @@ struct ore_comp { struct ore_layout { /* Our way of looking at the data_map */ + enum pnfs_osd_raid_algorithm4 + raid_algorithm; unsigned stripe_unit; unsigned mirrors_p1; -- cgit v1.2.3-58-ga151 From eb507bc18969f63b8968034144fd69706c492516 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:17:28 -0700 Subject: ore: Make ore_striping_info and ore_calc_stripe_info public The struct ore_striping_info will be used later in other structures. And ore_calc_stripe_info as well. Rename them make struct ore_striping_info public. ore_calc_stripe_info is still static, will be made public on first use. Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 24 ++++++++---------------- include/scsi/osd_ore.h | 8 ++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 870f85a232d1..c2b0033a724b 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -287,16 +287,8 @@ EXPORT_SYMBOL(ore_check_io); * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - u64 M; /* for truncate */ - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct _striping_info *si) +static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; @@ -375,7 +367,7 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, } static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct _striping_info *si) + struct ore_striping_info *si) { unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; @@ -434,14 +426,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) { u64 length = ios->length; u64 offset = ios->offset; - struct _striping_info si; + struct ore_striping_info si; int ret = 0; if (!ios->pages) { if (ios->kern_buff) { struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - _calc_stripe_info(ios->layout, ios->offset, &si); + ore_calc_stripe_info(ios->layout, ios->offset, &si); per_dev->offset = si.obj_offset; per_dev->dev = si.dev; @@ -455,7 +447,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) } while (length) { - _calc_stripe_info(ios->layout, offset, &si); + ore_calc_stripe_info(ios->layout, offset, &si); if (length < si.group_length) si.group_length = length; @@ -744,7 +736,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, } struct _trunc_info { - struct _striping_info si; + struct ore_striping_info si; u64 prev_group_obj_off; u64 next_group_obj_off; @@ -758,7 +750,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, { unsigned stripe_unit = layout->stripe_unit; - _calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? 
(ti->si.M - 1) * stripe_unit : 0; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index f7fabb478877..e4d550faa7c9 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -56,6 +56,14 @@ struct ore_components { struct osd_dev **ods; /* osd_dev array */ }; +struct ore_striping_info { + u64 obj_offset; + u64 group_length; + u64 M; /* for truncate */ + unsigned dev; + unsigned unit_off; +}; + struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); -- cgit v1.2.3-58-ga151 From d866d875f68fdeae63df334d291fe138dc636d96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 14:43:09 +0300 Subject: ore/exofs: Change the type of the devices array (API change) In the pNFS obj-LD the device table at the layout level needs to point to a device_cache node, where it is possible and likely that many layouts will point to the same device-nodes. In Exofs we have a more orderly structure where we have a single array of devices that repeats twice for a round-robin view of the device table This patch moves to a model that can be used by the pNFS obj-LD where struct ore_components holds an array of ore_dev-pointers. (ore_dev is newly defined and contains a struct osd_dev *od member) Each pointer in the array of pointers will point to a bigger user-defined dev_struct. That can be accessed by use of the container_of macro. In Exofs an __alloc_dev_table() function allocates the ore_dev-pointers array as well as an exofs_dev array, in one allocation and does the addresses dance to set everything pointing correctly. It still keeps the double allocation trick for the inodes round-robin view of the table. The device table is always allocated dynamically, also for the single device case. So it is unconditionally freed at umount. 
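The access pattern this buys is the usual embedded-struct/container_of one; a minimal sketch using the exofs names from the patch below:

	struct ore_dev {
		struct osd_dev *od;
	};

	struct exofs_dev {
		struct ore_dev ored;	/* embedded; oc->ods[i] points at this */
		unsigned did;
	};

	static inline struct exofs_dev *to_exofs_dev(struct ore_dev *ored)
	{
		/* recover the containing exofs_dev from the generic pointer */
		return container_of(ored, struct exofs_dev, ored);
	}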
Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 10 +++-- fs/exofs/ore.c | 2 +- fs/exofs/super.c | 99 +++++++++++++++++++++++++++++++------------------- include/scsi/osd_ore.h | 26 ++++++++++++- 4 files changed, 94 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 3b2e0478f363..006fd6f33571 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -53,6 +53,10 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_dev { + struct ore_dev ored; + unsigned did; +}; /* * our extension to the in-memory superblock */ @@ -69,7 +73,6 @@ struct exofs_sb_info { struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; /* @@ -214,13 +217,14 @@ static inline void exofs_init_comps(struct ore_components *oc, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - oc->numdevs = sbi->oc.numdevs; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; oc->single_comp = EC_SINGLE_COMP; oc->comps = one_comp; /* Round robin device view of the table */ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; - oc->ods = sbi->oc.ods + first_dev; + oc->ods = &sbi->oc.ods[first_dev]; } #endif diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index c2b0033a724b..a7d79257fc65 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -59,7 +59,7 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->oc->ods[index]; + return ore_comp_dev(ios->oc, index); } int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 90b4c526939f..bce3686f0aa0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -431,17 +431,18 @@ static void _exofs_print_device(const char *msg, const char *dev_path, static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->oc.numdevs) { - int i = --sbi->oc.numdevs; - struct osd_dev *od = sbi->oc.ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->oc.ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } - if (sbi->oc.ods != sbi->_min_one_dev) - kfree(sbi->oc.ods); + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0], + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -592,12 +593,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + 
numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, struct osd_dev *fscb_od, unsigned table_count) { struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -634,20 +663,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->oc.ods[0]); - - /* Twice bigger table: See exofs_init_comps() and below - * comment - */ - sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->oc.ods)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", - numdevs); - ret = -ENOMEM; - goto out; - } - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -663,12 +688,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->oc.ods[i] = fscb_od; + eds[i].ored.od = fscb_od; ++sbi->oc.numdevs; fscb_od = NULL; continue; @@ -682,7 +710,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->oc.ods[i] = od; + eds[i].ored.od = od; ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS @@ -705,21 +733,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); - if (likely(!ret)) { - unsigned numdevs = sbi->oc.numdevs; - - if (unlikely(fscb_od)) { + if (unlikely(fscb_od && !ret)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); osduld_put_device(fscb_od); return -EINVAL; - } - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. 
See exofs_init_comps() - */ - for (i = 0; i < numdevs - 1; ++i) - sbi->oc.ods[i + numdevs] = sbi->oc.ods[i]; } return ret; } @@ -773,7 +790,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->oc.numdevs = 1; sbi->oc.single_comp = EC_SINGLE_COMP; sbi->oc.comps = &sbi->one_comp; - sbi->oc.ods = sbi->_min_one_dev; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -822,7 +838,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->oc.ods[0] = od; + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -862,7 +884,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0], + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); return 0; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index e4d550faa7c9..8fefdfbb1ced 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -44,6 +44,10 @@ struct ore_layout { unsigned group_count; }; +struct ore_dev { + struct osd_dev *od; +}; + struct ore_components { unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single @@ -53,9 +57,29 @@ struct ore_components { EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff } single_comp; struct ore_comp *comps; - struct osd_dev **ods; /* osd_dev array */ + + /* Array of pointers to ore_dev-* . User will usually have these pointed + * too a bigger struct which contain an "ore_dev ored" member and use + * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger + * structure. + */ + struct ore_dev **ods; }; +/* ore_comp_dev Recievies a logical device index */ +static inline struct osd_dev *ore_comp_dev( + const struct ore_components *oc, unsigned i) +{ + BUG_ON(oc->numdevs <= i); + return oc->ods[i]->od; +} + +static inline void ore_comp_set_dev( + struct ore_components *oc, unsigned i, struct osd_dev *od) +{ + oc->ods[i]->od = od; +} + struct ore_striping_info { u64 obj_offset; u64 group_length; -- cgit v1.2.3-58-ga151 From 3ee77f2091c229a72c64d9aeef1ee4a7624e5795 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:55:52 -0700 Subject: ext3/balloc.c: local functions should be static This quites the sparse noise: warning: symbol 'ext3_trim_all_free' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index caedc00520ed..a7fceaba87f1 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) * reaches any used block. Then issue a TRIM command on this extent and free * the extent in the block bitmap. This is done until whole group is scanned. 
*/ -ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) { handle_t *handle; ext3_grpblk_t next, free_blocks, bit, freed, count = 0; -- cgit v1.2.3-58-ga151 From bc1123239ab950798779ca4e23228afb5443eb5d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 3 Oct 2011 14:02:59 +0900 Subject: udf: Add readpages support for udf. Use mpage_readpages() instead of multiple calls to udf_readpage() to reduce CPU utilization and improve performance. Signed-off-by: Namjae Jeon Signed-off-by: Jan Kara --- fs/udf/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358ed80c1..f94d6f9febf5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "udf_i.h" #include "udf_sb.h" @@ -104,7 +105,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc) static int udf_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, udf_get_block); + return mpage_readpage(page, udf_get_block); +} + +static int udf_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -139,6 +146,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, + .readpages = udf_readpages, .writepage = udf_writepage, .write_begin = udf_write_begin, .write_end = generic_write_end, -- cgit v1.2.3-58-ga151 From 7aa0baeaba4afc4fbed7aad2812a1116e6b0adcd Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 6 Oct 2011 10:22:28 -0400 Subject: ext4: Free resources in ext4_mb_init()'s error paths In commit 79a77c5ac, we moved ext4_mb_init_backend after the allocation of s_locality_group to avoid a memory leak in the error path, but there are still some other error paths in ext4_mb_init that need to do the same work. So this patch adds handling for all the error paths in ext4_mb_init. All the pointers are also reset to NULL, so a later double free by the caller is harmless.
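The shape the patch moves ext4_mb_init() to is the usual goto ladder, unwinding in reverse order of setup; a generic sketch of the idiom with illustrative names:

	struct ctx { void *a, *b, *c; };

	static int setup_a(struct ctx *c) { return 0; /* ... */ }
	static int setup_b(struct ctx *c) { return 0; /* ... */ }
	static int setup_c(struct ctx *c) { return 0; /* ... */ }
	static void teardown_a(struct ctx *c) { /* ... */ }
	static void teardown_b(struct ctx *c) { /* ... */ }

	int setup_all(struct ctx *c)
	{
		int ret;

		ret = setup_a(c);
		if (ret)
			goto out;
		ret = setup_b(c);
		if (ret)
			goto out_free_a;
		ret = setup_c(c);
		if (ret)
			goto out_free_b;
		return 0;

	out_free_b:		/* unwind in reverse order of setup */
		teardown_b(c);
	out_free_a:
		teardown_a(c);
	out:
		return ret;
	}

Resetting each freed pointer to NULL, as the patch also does, makes any later kfree() of the same pointer by the caller a harmless no-op, since kfree(NULL) is defined to do nothing.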
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1c83161090f8..8c005c028203 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2504,7 +2504,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_groupinfo_slab; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2517,9 +2517,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) /* init file for buddy data */ ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } + if (ret != 0) + goto out_free_locality_groups; if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, @@ -2527,11 +2526,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out_free_groupinfo_slab: + ext4_groupinfo_destroy_slabs(); out: - if (ret) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - } + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; return ret; } -- cgit v1.2.3-58-ga151 From dcf2d804ed6ffe5e942b909ed5e5b74628be6ee4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 6 Oct 2011 12:10:11 -0400 Subject: ext4: Free resources in some error path in ext4_fill_super Some of the error paths in ext4_fill_super don't release their resources properly, so this patch releases them in the right way. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 07f3de341452..db2cd3f21074 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3785,22 +3785,19 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); - goto failed_mount4; + goto failed_mount5; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) - goto failed_mount4; + goto failed_mount6; sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, "%s", sb->s_id); - if (err) { - ext4_mb_release(sb); - ext4_ext_release(sb); - goto failed_mount4; - }; + if (err) + goto failed_mount7; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -3834,13 +3831,19 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_ext_release(sb); +failed_mount5: + ext4_mb_release(sb); + ext4_release_system_zone(sb); failed_mount4: iput(root); sb->s_root = NULL; ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: - ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; -- cgit v1.2.3-58-ga151 From 4113c4caa4f355b8ff8b7ff0510c29c9d00d30b3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 8 Oct 2011 14:34:47 -0400 Subject: ext4: remove deprecated oldalloc For a long time now orlov has been the default block allocator in ext4.
It performs better than the old one and no one seems to claim otherwise so we can safely drop it and make oldalloc and orlov mount option deprecated. This is a part of the effort to reduce number of ext4 options hence the test matrix. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 8 --- fs/ext4/ext4.h | 1 - fs/ext4/ialloc.c | 138 +------------------------------------ fs/ext4/super.c | 8 +-- 4 files changed, 7 insertions(+), 148 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index a53040193967..4917cf24a5e0 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -203,14 +203,6 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -orlov (*) This enables the new Orlov block allocator. It is - enabled by default. - -oldalloc This disables the Orlov block allocator and enables - the old block allocator. Orlov should have better - performance - we'd like to get some feedback if it's - the contrary for you. - nouser_xattr Disables Extended User Attributes. If you have extended attribute support enabled in the kernel configuration (CONFIG_EXT4_FS_XATTR), extended attribute support diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1bbd2caebe7f..61b0b5cd4aa2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -913,7 +913,6 @@ struct ext4_inode_info { /* * Mount flags */ -#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d50a7d5e4726..04219988e75f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -293,120 +293,6 @@ error_return: ext4_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. 
- */ -static int find_group_dir(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned int freei, avefreei; - struct ext4_group_desc *desc, *best_desc = NULL; - ext4_group_t group; - int ret = -1; - - freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) - continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) - continue; - if (!best_desc || - (ext4_free_group_clusters(sb, desc) > - ext4_free_group_clusters(sb, best_desc))) { - *best_group = group; - best_desc = desc; - ret = 0; - } - } - return ret; -} - -#define free_block_ratio 10 - -static int find_group_flex(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *desc; - struct flex_groups *flex_group = sbi->s_flex_groups; - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - int flex_size = ext4_flex_bg_size(sbi); - ext4_group_t best_flex = parent_fbg_group; - int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_clusters; - int flex_freeb_ratio; - ext4_group_t n_fbg_groups; - ext4_group_t i; - - n_fbg_groups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - -find_close_to_parent: - flexbg_free_clusters = atomic_read(&flex_group[best_flex].free_clusters); - flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / - blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && - flex_freeb_ratio > free_block_ratio) - goto found_flexbg; - - if (best_flex && best_flex == parent_fbg_group) { - best_flex--; - goto find_close_to_parent; - } - - for (i = 0; i < n_fbg_groups; i++) { - if (i == parent_fbg_group || i == parent_fbg_group - 1) - continue; - - flexbg_free_clusters = atomic_read(&flex_group[i].free_clusters); - flex_freeb_ratio = EXT4_C2B(sbi, flexbg_free_clusters) * 100 / - blocks_per_flex; - - if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { - best_flex = i; - goto found_flexbg; - } - - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_clusters) > - atomic_read(&flex_group[best_flex].free_clusters)) && - atomic_read(&flex_group[i].free_inodes))) - best_flex = i; - } - - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_clusters)) - return -1; - -found_flexbg: - for (i = best_flex * flex_size; i < ngroups && - i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (ext4_free_inodes_count(sb, desc)) { - *best_group = i; - goto out; - } - } - - return -1; -out: - return 0; -} - struct orlov_stats { __u32 free_inodes; __u32 free_clusters; @@ -819,7 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -845,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, goto got_group; } - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { - ret2 = find_group_flex(sb, dir, &group); - if (ret2 == -1) { 
- ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) { - once = 0; - printk(KERN_NOTICE "ext4: find_group_flex " - "failed, fallback succeeded dir %lu\n", - dir->i_ino); - } - } - goto got_group; - } - - if (S_ISDIR(mode)) { - if (test_opt(sb, OLDALLOC)) - ret2 = find_group_dir(sb, dir, &group); - else - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); - } else + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else ret2 = find_group_other(sb, dir, &group, mode); got_group: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db2cd3f21074..42f76c64df8c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1073,8 +1073,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT4_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1583,10 +1581,12 @@ static int parse_options(char *options, struct super_block *sb, set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: -- cgit v1.2.3-58-ga151 From df3ab17072c31fbd394614711772682f0a956a2c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 8 Oct 2011 15:53:49 -0400 Subject: ext4: fix the comment describing ext4_ext_search_right() The comment describing what ext4_ext_search_right() does is incorrect. We return 0 in *phys when *logical is the 'largest' allocated block, not smallest. Fix a few other typos while we're at it. Cc: "Theodore Ts'o" Signed-off-by: Tao Ma --- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ad39627c1fbc..142e3443476f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1263,7 +1263,7 @@ static int ext4_ext_search_left(struct inode *inode, /* * search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function + * if *logical is the largest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ @@ -2168,7 +2168,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, * need to account for leaf block credit * * bitmaps and block group descriptor blocks - * and other metadat blocks still need to be + * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1380cd29c312..4863238c3754 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -458,7 +458,7 @@ static void set_buffers_da_mapped(struct inode *inode, * the buffer head is mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that casem, buffer head is unmapped + * that case, buffer head is unmapped * * It returns the error in case of allocation failure. 
*/ @@ -497,7 +497,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns th create = 0 + * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) -- cgit v1.2.3-58-ga151 From 7fd59c83b05dc1b8af2be4d991ee376f782cd8b0 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 8 Oct 2011 15:56:35 -0400 Subject: ext4: remove the obsolete/broken EXT4_IOC_WAIT_FOR_READONLY ioctl There are no users of the EXT4_IOC_WAIT_FOR_READONLY ioctl, and it is also broken. No one sets the set_ro_timer, no one wakes up us and our state is set to TASK_INTERRUPTIBLE not RUNNING. So remove it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ---------- fs/ext4/ioctl.c | 28 ---------------------------- 2 files changed, 38 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61b0b5cd4aa2..4546da4f26c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -564,9 +564,6 @@ struct ext4_new_group_data { #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -#endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) @@ -589,9 +586,6 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -#endif #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif @@ -1171,10 +1165,6 @@ struct ext4_sb_info { u32 s_max_batch_time; u32 s_min_batch_time; struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2046d699b4a7..8f7ea6905421 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -174,29 +174,6 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. 
- */ - { - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; int err, err2=0; @@ -421,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC32_WAIT_FOR_READONLY: - cmd = EXT4_IOC_WAIT_FOR_READONLY; - break; -#endif case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; -- cgit v1.2.3-58-ga151 From 6ee3b2122431fc75015c6114d4749de76422452b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 8 Oct 2011 16:08:34 -0400 Subject: ext4: use le32_to_cpu for ext4_extent_idx.ei_block in ext4_ext_search_left() ext4_extent_idx.e_block is __le32, so use le32_to_cpu() in ext4_ext_search_left(). Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 142e3443476f..f473ddf0bd94 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1238,9 +1238,9 @@ static int ext4_ext_search_left(struct inode *inode, if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", - ix != NULL ? ix->ei_block : 0, + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); return -EIO; } -- cgit v1.2.3-58-ga151 From d44651d0f922b7aaeddd9fc04f2f5700a65983dd Mon Sep 17 00:00:00 2001 From: Fabrice Jouhaud Date: Sat, 8 Oct 2011 16:26:03 -0400 Subject: ext4: fix ext4 so it works without CONFIG_PROC_FS This fixes a bug which was introduced in dd68314ccf3fb. The problem came from the test of the return value of proc_mkdir which is always false without procfs, and this would initialization of ext4. 
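In outline, the fix treats procfs as an optional facility: record whatever proc_mkdir() returns, never fail module init on a NULL handle, and guard the matching teardown on the saved pointer. A minimal userspace sketch of that pattern follows; the proc_* stubs are stand-ins for illustration, not the kernel API.

/*
 * Sketch only: an optional facility may legitimately fail to set up,
 * so its handle is recorded, failure is non-fatal, and teardown is
 * guarded on the handle.
 */
#include <stdio.h>
#include <stdlib.h>

struct proc_dir { const char *name; };

/* Stand-in: returns NULL when the facility is compiled out. */
static struct proc_dir *proc_mkdir_stub(const char *name)
{
#ifdef NO_PROCFS
	return NULL;
#else
	struct proc_dir *d = malloc(sizeof(*d));
	if (d)
		d->name = name;
	return d;
#endif
}

static void proc_remove_stub(struct proc_dir *d)
{
	free(d);
}

static struct proc_dir *proc_root;

static int init_fs(void)
{
	/* Optional: do not fail the whole init if this returns NULL. */
	proc_root = proc_mkdir_stub("fs/example");
	return 0;
}

static void exit_fs(void)
{
	/* Teardown mirrors init: only remove what was created. */
	if (proc_root)
		proc_remove_stub(proc_root);
}

int main(void)
{
	if (init_fs() == 0)
		printf("init ok, proc root %s\n",
		       proc_root ? "present" : "absent");
	exit_fs();
	return 0;
}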
Signed-off-by: Fabrice Jouhaud Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 42f76c64df8c..dcc460537bc7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3540,10 +3540,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } -#ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); -#endif bgl_lock_init(sbi->s_blockgroup_lock); @@ -5070,13 +5068,11 @@ static int __init ext4_init_fs(void) return err; err = ext4_init_system_zone(); if (err) - goto out7; + goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - goto out6; - ext4_proc_root = proc_mkdir("fs/ext4", NULL); - if (!ext4_proc_root) goto out5; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); if (err) @@ -5112,12 +5108,12 @@ out2: out3: ext4_exit_feat_adverts(); out4: - remove_proc_entry("fs/ext4", NULL); -out5: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); -out6: +out5: ext4_exit_system_zone(); -out7: +out6: ext4_exit_pageio(); return err; } -- cgit v1.2.3-58-ga151 From 40bfa16dac2adcded9e2eda58246cc3700d97de4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 7 Oct 2011 23:21:18 +0800 Subject: ext3: Remove the obsolete broken EXT3_IOC32_WAIT_FOR_READONLY. There are no user of EXT3_IOC32_WAIT_FOR_READONLY and also it is broken. No one set the set_ro_timer, no one wake up us and our state is set to TASK_INTERRUPTIBLE not RUNNING. So remove it. Cc: Jan Kara Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/ext3/ioctl.c | 24 ------------------------ include/linux/ext3_fs_sb.h | 4 ---- 2 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index c7f43944f160..ba1b54e23cae 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -150,30 +150,6 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. 
- */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index 258088ab3c6b..64365252f1b0 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -76,10 +76,6 @@ struct ext3_sb_info { struct mutex s_resize_lock; unsigned long s_commit_interval; struct block_device *journal_bdev; -#ifdef CONFIG_JBD_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ -- cgit v1.2.3-58-ga151 From 7e273e3b41e32716dc122b293b5f15635af495ff Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:02 -0700 Subject: udf: Promote some debugging messages to udf_error If there is a problem with a scratched disc or loader, it's valuable to know which error occurred. Convert some debug messages to udf_error, neaten those messages too. Add the calculated tag checksum and the read checksum to error message. Make udf_error a public function and move the logging prototypes together. Original-patch-by: NamJae Jeon Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/misc.c | 13 +++++++++---- fs/udf/super.c | 6 ++---- fs/udf/udfdecl.h | 9 +++++++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 9215700c00a4..a85fa7c55b0a 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, { struct tag *tag_p; struct buffer_head *bh = NULL; + u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) @@ -211,7 +212,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_debug("block=%d, location=%d: read failed\n", + udf_error(sb, __func__, "read failed, block=%u, location=%d\n", block, location); return NULL; } @@ -227,15 +228,19 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, } /* Verify the tag checksum */ - if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { - printk(KERN_ERR "udf: tag checksum failed block %d\n", block); + checksum = udf_tag_checksum(tag_p); + if (checksum != tag_p->tagChecksum) { + udf_error(sb, __func__, + "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { - udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", + udf_error(sb, __func__, + "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", le16_to_cpu(tag_p->descVersion), block); goto error_out; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 7b27b063ff6d..80f47ce515bc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -92,8 +92,6 @@ static void udf_close_lvid(struct super_block *); static unsigned int 
udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); static int udf_show_options(struct seq_file *, struct vfsmount *); -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -2096,8 +2094,8 @@ error_out: return -EINVAL; } -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index dbd52d4b5eed..81e66afecd42 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -29,6 +29,13 @@ do { \ #define udf_debug(f, a...) /**/ #endif +__attribute__((format(printf, 3, 4))) +extern void udf_warning(struct super_block *, const char *, const char *, ...); + +__attribute__((format(printf, 3, 4))) +extern void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...); + #define udf_info(f, a...) \ printk(KERN_INFO "UDF-fs INFO " f, ##a); @@ -112,8 +119,6 @@ struct extent_position { /* super.c */ -__attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; -- cgit v1.2.3-58-ga151 From 8076c363da15e7c35a4094974d1b4bcc196b5fa9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:03 -0700 Subject: udf: Rename udf_error to udf_err Rename udf_error to udf_err for consistency with normal logging uses of pr_err. Rename function udf_err to _udf_err. Remove __func__ from uses and move __func__ to a new udf_err macro that calls _udf_err. Some of the udf_error uses had \n terminations, some did not so standardize \n's to udf_err uses, remove \n from _udf_err function. Coalesce udf_err formats. One message prefixed with udf_read_super is now prefixed with udf_fill_super. 
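The rename follows a common kernel logging idiom: one varargs worker function plus a short macro that injects __func__, so call sites stay compact and format strings remain greppable. A minimal portable sketch of the idiom is below; the superblock argument is dropped for brevity and the body is illustrative only.

/*
 * Sketch of the _worker + macro idiom: the macro supplies __func__
 * and the printf attribute gives compile-time format checking.
 */
#include <stdarg.h>
#include <stdio.h>

__attribute__((format(printf, 2, 3)))
static void _udf_err_sketch(const char *function, const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "UDF-fs error: %s: ", function);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

/* Call sites never spell out __func__ themselves. */
#define udf_err_sketch(fmt, ...) \
	_udf_err_sketch(__func__, fmt, ##__VA_ARGS__)

int main(void)
{
	udf_err_sketch("tag checksum failed, block %u\n", 42u);
	return 0;
}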
Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/misc.c | 18 ++++++++---------- fs/udf/super.c | 27 +++++++++++---------------- fs/udf/udfdecl.h | 7 ++++--- 3 files changed, 23 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/udf/misc.c b/fs/udf/misc.c index a85fa7c55b0a..c175b4dabc14 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -212,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_error(sb, __func__, "read failed, block=%u, location=%d\n", - block, location); + udf_err(sb, "read failed, block=%u, location=%d\n", + block, location); return NULL; } @@ -230,18 +230,16 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, /* Verify the tag checksum */ checksum = udf_tag_checksum(tag_p); if (checksum != tag_p->tagChecksum) { - udf_error(sb, __func__, - "tag checksum failed, block %u: 0x%02x != 0x%02x\n", - block, checksum, tag_p->tagChecksum); + udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { - udf_error(sb, __func__, - "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", - le16_to_cpu(tag_p->descVersion), block); + udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", + le16_to_cpu(tag_p->descVersion), block); goto error_out; } @@ -253,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, return bh; udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, - le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); - + le16_to_cpu(tag_p->descCRC), + le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; diff --git a/fs/udf/super.c b/fs/udf/super.c index 80f47ce515bc..76cc08f9202e 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -242,9 +242,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_error(sb, __func__, - "Unable to allocate space for %d partition maps", - count); + udf_err(sb, "Unable to allocate space for %d partition maps\n", + count); sbi->s_partitions = 0; return -ENOMEM; } @@ -879,8 +878,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) if (mdata->s_mirror_fe == NULL) { if (fe_error) { - udf_error(sb, __func__, "mirror inode efe not found " - "and metadata inode is missing too, exiting..."); + udf_err(sb, "mirror inode efe not found and metadata inode is missing too, exiting...\n"); goto error_exit; } else udf_warning(sb, __func__, "mirror inode efe not found," @@ -915,8 +913,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) "not found but it's ok since the disc" " is mounted read-only"); else { - udf_error(sb, __func__, "bitmap inode efe not " - "found and attempted read-write mount"); + udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); goto error_exit; } } @@ -969,9 +966,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ if (bitmap == NULL) { - udf_error(sb, __func__, - "Unable to allocate space for bitmap " - "and %d buffer_head pointers", nr_groups); + udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", + 
nr_groups); return NULL; } @@ -1935,8 +1931,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_UTF8) && uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_error(sb, "udf_read_super", - "utf8 cannot be combined with iocharset\n"); + udf_err(sb, "utf8 cannot be combined with iocharset\n"); goto error_out; } #ifdef CONFIG_UDF_NLS @@ -2094,8 +2089,8 @@ error_out: return -EINVAL; } -void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; @@ -2106,8 +2101,8 @@ void udf_error(struct super_block *sb, const char *function, va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); + printk(KERN_CRIT "UDF-fs error (device %s): %s: %s", + sb->s_id, function, error_buf); } void udf_warning(struct super_block *sb, const char *function, diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 81e66afecd42..7bc3ba1415fc 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -33,13 +33,14 @@ __attribute__((format(printf, 3, 4))) extern void udf_warning(struct super_block *, const char *, const char *, ...); __attribute__((format(printf, 3, 4))) -extern void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...); +extern void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) #define udf_info(f, a...) \ printk(KERN_INFO "UDF-fs INFO " f, ##a); - #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) -- cgit v1.2.3-58-ga151 From a40ecd7b3ccf520ff02da93e8d1ba6cd55c2e359 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:04 -0700 Subject: udf: Rename udf_warning to udf_warn Rename udf_warning to udf_warn for consistency with normal logging uses of pr_warn. Rename function udf_warning to _udf_warn. Remove __func__ from uses and move __func__ to a new udf_warn macro that calls _udf_warn. Add \n's to uses of udf_warn, remove \n from _udf_warn. Coalesce formats. 
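Coalescing formats matters mostly for grep-ability: a user-visible string split across source lines cannot be found by searching for the full sentence. A tiny illustration, with printf standing in for the udf_warn call:

#include <stdio.h>

int main(void)
{
	/* Before: split literal; grepping the full sentence fails. */
	printf("metadata inode efe does not have "
	       "short allocation descriptors!\n");

	/* After: one coalesced literal, trivially searchable. */
	printf("metadata inode efe does not have short allocation descriptors!\n");
	return 0;
}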
Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/namei.c | 5 ++--- fs/udf/partition.c | 3 +-- fs/udf/super.c | 22 ++++++++-------------- fs/udf/udfdecl.h | 5 ++++- 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce848ef96..78d59ebd0656 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -799,9 +799,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (inode->i_nlink != 2) - udf_warning(inode->i_sb, "udf_rmdir", - "empty directory has nlink != 2 (%d)", - inode->i_nlink); + udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n", + inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); diff --git a/fs/udf/partition.c b/fs/udf/partition.c index a71090ea0e07..c72edb2260e3 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -322,8 +322,7 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, BUG_ON(!inode); retblk = udf_try_read_meta(inode, block, partition, offset); if (retblk == 0xFFFFFFFF) { - udf_warning(sb, __func__, "error reading from METADATA, " - "trying to read from MIRROR"); + udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); inode = mdata->s_mirror_fe; if (!inode) return 0xFFFFFFFF; diff --git a/fs/udf/super.c b/fs/udf/super.c index 76cc08f9202e..622331f1290d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -855,13 +855,11 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) mdata->s_metadata_fe = udf_iget(sb, &addr); if (mdata->s_metadata_fe == NULL) { - udf_warning(sb, __func__, "metadata inode efe not found, " - "will try mirror inode."); + udf_warn(sb, "metadata inode efe not found, will try mirror inode\n"); fe_error = 1; } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "metadata inode efe does not have " - "short allocation descriptors!"); + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); fe_error = 1; iput(mdata->s_metadata_fe); mdata->s_metadata_fe = NULL; @@ -881,12 +879,10 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_err(sb, "mirror inode efe not found and metadata inode is missing too, exiting...\n"); goto error_exit; } else - udf_warning(sb, __func__, "mirror inode efe not found," - " but metadata inode is OK"); + udf_warn(sb, "mirror inode efe not found, but metadata inode is OK\n"); } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "mirror inode efe does not have " - "short allocation descriptors!"); + udf_warn(sb, "mirror inode efe does not have short allocation descriptors!\n"); iput(mdata->s_mirror_fe); mdata->s_mirror_fe = NULL; if (fe_error) @@ -909,9 +905,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) - udf_warning(sb, __func__, "bitmap inode efe " - "not found but it's ok since the disc" - " is mounted read-only"); + udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); goto error_exit; @@ -2105,15 +2099,15 @@ void _udf_err(struct super_block *sb, const char *function, sb->s_id, function, error_buf); } -void udf_warning(struct super_block *sb, const char *function, - const char *fmt, ...) 
+void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n", + printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s", sb->s_id, function, error_buf); } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 7bc3ba1415fc..85e15edc080f 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -30,7 +30,10 @@ do { \ #endif __attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); +extern void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...); +#define udf_warn(sb, fmt, ...) \ + _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) __attribute__((format(printf, 3, 4))) extern void _udf_err(struct super_block *sb, const char *function, -- cgit v1.2.3-58-ga151 From 71c3bcd71393a9e67d5b77597a612537f89c30ed Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 27 Sep 2011 21:42:29 -0400 Subject: nfsd4: fix state lock usage in LOCKU In commit 5ec094c1096ab3bb795651855d53f18daa26afde "nfsd4: extend state lock over seqid replay logic" I modified the exit logic of all the seqid-based procedures except nfsd4_locku(). Fix the oversight. The result of the bug was a double-unlock while handling the LOCKU procedure, and a warning like: [ 142.150014] WARNING: at kernel/mutex-debug.c:78 debug_mutex_unlock+0xda/0xe0() ... [ 142.152927] Pid: 742, comm: nfsd Not tainted 3.1.0-rc1-SLIM+ #9 [ 142.152927] Call Trace: [ 142.152927] [] warn_slowpath_common+0x7f/0xc0 [ 142.152927] [] warn_slowpath_null+0x1a/0x20 [ 142.152927] [] debug_mutex_unlock+0xda/0xe0 [ 142.152927] [] __mutex_unlock_slowpath+0x80/0x140 [ 142.152927] [] mutex_unlock+0xe/0x10 [ 142.152927] [] nfs4_lock_state+0x35/0x40 [nfsd] [ 142.152927] [] nfsd4_proc_compound+0x2a1/0x690 [nfsd] [ 142.152927] [] nfsd_dispatch+0xeb/0x230 [nfsd] [ 142.152927] [] svc_process_common+0x345/0x690 [sunrpc] [ 142.152927] [] ? try_to_wake_up+0x280/0x280 [ 142.152927] [] svc_process+0x102/0x150 [sunrpc] [ 142.152927] [] nfsd+0xbd/0x160 [nfsd] [ 142.152927] [] ? 0xffffffffa039efff [ 142.152927] [] kthread+0x8c/0xa0 [ 142.152927] [] kernel_thread_helper+0x4/0x10 [ 142.152927] [] ? kthread_worker_fn+0x190/0x190 [ 142.152927] [] ? gs_change+0x13/0x13 Reported-by: Bryan Schumaker Tested-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 931155f51ecc..8e253a347649 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4185,7 +4185,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; out_nfserr: -- cgit v1.2.3-58-ga151 From 6409a5a65d2b959c3f5e2b8adfa67c349e294652 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 28 Sep 2011 11:37:56 -0400 Subject: nfsd4: clean up downgrading code In response to some review comments, get rid of the somewhat obscure for-loop with bitops, and improve a comment. Reported-by: Steve Dickson Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 31 ++++++++++++++++++++++--------- fs/nfsd/state.h | 8 +++++--- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8e253a347649..683885b18ceb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3523,16 +3523,29 @@ out: return status; } -static inline void nfs4_file_downgrade(struct nfs4_ol_stateid *stp, unsigned int to_access) +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { - int i; + if (!test_bit(access, &stp->st_access_bmap)) + return; + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + __clear_bit(access, &stp->st_access_bmap); +} - for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap) - && ((i & to_access) != i)) { - nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); - __clear_bit(i, &stp->st_access_bmap); - } +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) +{ + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + BUG(); } } @@ -3578,7 +3591,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, stp->st_deny_bmap, od->od_share_deny); goto out; } - nfs4_file_downgrade(stp, od->od_share_access); + nfs4_stateid_downgrade(stp, od->od_share_access); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 13f6f9f5ceec..22b065a55ea0 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -403,9 +403,11 @@ struct nfs4_file { /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* - * Each open or lock stateid contributes 1 to either - * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending - * on open or lock mode: + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; struct file *fi_deleg_file; -- cgit v1.2.3-58-ga151 From b31b30e5c76b7653b4434fcdc3c5d2b46a367c2a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 28 Sep 2011 11:47:20 -0400 Subject: nfsd4: cleanup state.h comments These comments are mostly out of date. Reported-by: Bryan Schumaker --- fs/nfsd/state.h | 45 ++++++++------------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 22b065a55ea0..aa14f06af2df 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -332,25 +332,6 @@ struct nfs4_replay { char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -/* -* nfs4_stateowner can either be an open_owner, or a lock_owner -* -* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] -* for lock_owner -* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] -* for lock_owner -* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client -* struct is reaped. 
-* so_perfilestate: heads the list of nfs4_ol_stateid (either open or lock) -* and is used to ensure no dangling nfs4_ol_stateid references when we -* release a stateowner. -* so_perlockowner: (open) nfs4_ol_stateid->st_perlockowner entry - used when -* close is called to reap associated byte-range locks -* so_close_lru: (open) stateowner is placed on this list instead of being -* reaped (when so_perfilestate is empty) to hold the last close replay. -* reaped by laundramat thread after lease period. -*/ - struct nfs4_stateowner { struct list_head so_strhash; /* hash by op_name */ struct list_head so_stateids; @@ -366,7 +347,14 @@ struct nfs4_stateowner { struct nfs4_openowner { struct nfs4_stateowner oo_owner; /* must be first field */ struct list_head oo_perclient; - struct list_head oo_close_lru; /* tail queue */ + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 @@ -443,23 +431,6 @@ static inline struct file *find_any_file(struct nfs4_file *f) return f->fi_fds[O_RDONLY]; } -/* -* nfs4_ol_stateid can either be an open stateid or (eventually) a lock stateid -* -* (open)nfs4_ol_stateid: one per (open)nfs4_stateowner, nfs4_file -* -* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry -* st_perfile: file_hashtbl[] entry. -* st_perfile_state: nfs4_stateowner->so_perfilestate -* st_perlockowner: (open stateid) list of lock nfs4_stateowners -* st_access_bmap: used only for open stateid -* st_deny_bmap: used only for open stateid -* st_openstp: open stateid lock stateid was derived from -* -* XXX: open stateids and lock stateids have diverged sufficiently that -* we should consider defining separate structs for the two cases. -*/ - /* "ol" stands for "Open or Lock". Better suggestions welcome. */ struct nfs4_ol_stateid { struct nfs4_stid st_stid; -- cgit v1.2.3-58-ga151 From c30e92df30d7d5fe65262fbce5d1b7de675fe34e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 17:34:31 -0400 Subject: nfsd4: ignore WANT bits in open downgrade We don't use WANT bits yet--and sending them can probably trigger a BUG() further down. Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 683885b18ceb..a39a542d8be5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3574,6 +3574,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, if (!access_valid(od->od_share_access, cstate->minorversion) || !deny_valid(od->od_share_deny)) return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; nfs4_lock_state(); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, -- cgit v1.2.3-58-ga151 From 04f9e664b21c4440daf4d08f31db9b18517e4b8d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 14:37:13 -0400 Subject: nfsd4: move access/deny validity checks to xdr code I'd rather put more of these sorts of checks into standardized xdr decoders for the various types rather than have them cluttering up the core logic in nfs4proc.c and nfs4state.c. 
Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 32 --------------------- fs/nfsd/nfs4xdr.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 73 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a39a542d8be5..6bfa293e1c91 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2418,31 +2418,6 @@ find_file(struct inode *ino) return NULL; } -static inline int access_valid(u32 x, u32 minorversion) -{ - if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) - return 0; - if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) - return 0; - x &= ~NFS4_SHARE_ACCESS_MASK; - if (minorversion && x) { - if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) - return 0; - if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) - return 0; - x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); - } - if (x) - return 0; - return 1; -} - -static inline int deny_valid(u32 x) -{ - /* Note: unlike access bits, deny bits may be zero. */ - return x <= NFS4_SHARE_DENY_BOTH; -} - /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid @@ -2918,10 +2893,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_delegation *dp = NULL; __be32 status; - status = nfserr_inval; - if (!access_valid(open->op_share_access, resp->cstate.minorversion) - || !deny_valid(open->op_share_deny)) - goto out; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. @@ -3571,9 +3542,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access, cstate->minorversion) - || !deny_valid(od->od_share_deny)) - return nfserr_inval; /* We don't yet support WANT bits: */ od->od_share_access &= NFS4_SHARE_ACCESS_MASK; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5779acde7e70..94da8bb36c85 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -639,6 +639,64 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_TAIL; } +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + READ32(w); + *x = w; + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= !NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= !NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + READ32(*x); + /* Note: unlinke access bits, deny bits may be zero. 
*/ + if (*x & !NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -649,10 +707,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(16 + sizeof(clientid_t)); + READ_BUF(4); READ32(open->op_seqid); - READ32(open->op_share_access); - READ32(open->op_share_deny); + status = nfsd4_decode_share_access(argp, &open->op_share_access); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t) + 4); COPYMEM(&open->op_clientid, sizeof(clientid_t)); READ32(open->op_owner.len); @@ -753,11 +816,14 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(12); + READ_BUF(4); READ32(open_down->od_seqid); - READ32(open_down->od_share_access); - READ32(open_down->od_share_deny); - + status = nfsd4_decode_share_access(argp, &open_down->od_share_access); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; DECODE_TAIL; } -- cgit v1.2.3-58-ga151 From a084daf512bb66fa3c8e21c7027daea521179cd0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 15:07:40 -0400 Subject: nfsd4: move name-length checks to xdr Again, these checks are better in the xdr code. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 24 ++---------------------- fs/nfsd/nfs4xdr.c | 45 ++++++++++++++++++++++++++++++--------------- fs/nfsd/xdr4.h | 3 +-- 3 files changed, 33 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6bfa293e1c91..5f35f35a2da0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1189,17 +1189,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, return clp; } -static int check_name(struct xdr_netobj name) -{ - if (name.len == 0) - return 0; - if (name.len > NFS4_OPAQUE_LIMIT) { - dprintk("NFSD: check_name: name too long(%d)!\n", name.len); - return 0; - } - return 1; -} - static void add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) { @@ -1442,7 +1431,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); - if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; /* Currently only support SP4_NONE */ @@ -1992,19 +1981,13 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct xdr_netobj clname = { - .len = setclid->se_namelen, - .data = setclid->se_name, - }; + struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; char dname[HEXDIR_LEN]; - if (!check_name(clname)) - return nfserr_inval; - status = nfs4_make_rec_clidname(dname, &clname); if (status) return status; @@ -2523,9 +2506,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_openowner *oo = NULL; __be32 status; - if (!check_name(open->op_owner)) - return nfserr_inval; - if 
(STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 94da8bb36c85..2cab33cc3238 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -697,6 +697,23 @@ xdr_error: return nfserr_bad_xdr; } +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + READ32(o->len); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -715,13 +732,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) status = nfsd4_decode_share_deny(argp, &open->op_share_deny); if (status) goto xdr_error; - READ_BUF(sizeof(clientid_t) + 4); + READ_BUF(sizeof(clientid_t)); COPYMEM(&open->op_clientid, sizeof(clientid_t)); - READ32(open->op_owner.len); - - /* owner, open_flag */ - READ_BUF(open->op_owner.len + 4); - SAVEMEM(open->op_owner.data, open->op_owner.len); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); READ32(open->op_create); switch (open->op_create) { case NFS4_OPEN_NOCREATE: @@ -964,12 +980,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; - READ_BUF(12); + READ_BUF(8); COPYMEM(setclientid->se_verf.data, 8); - READ32(setclientid->se_namelen); - READ_BUF(setclientid->se_namelen + 8); - SAVEMEM(setclientid->se_name, setclientid->se_namelen); + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); READ32(setclientid->se_callback_prog); READ32(setclientid->se_callback_netid_len); @@ -1112,11 +1129,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); - READ_BUF(4); - READ32(exid->clname.len); - - READ_BUF(exid->clname.len); - SAVEMEM(exid->clname.data, exid->clname.len); + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; READ_BUF(4); READ32(exid->flags); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c9012149637c..4c8a7ec3f25d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -317,8 +317,7 @@ struct nfsd4_setattr { struct nfsd4_setclientid { nfs4_verifier se_verf; /* request */ - u32 se_namelen; /* request */ - char * se_name; /* request */ + struct xdr_netobj se_name; u32 se_callback_prog; /* request */ u32 se_callback_netid_len; /* request */ char * se_callback_netid_val; /* request */ -- cgit v1.2.3-58-ga151 From b6d2f1ca3c1162f51098969e9c52fd099720416a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Oct 2011 17:44:19 -0400 Subject: nfsd4: more robust ignoring of WANT bits in OPEN Mask out the WANT bits right at the start instead of on each use. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/nfs4state.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4e41f65c7021..5b192a2512b6 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -315,6 +315,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + /* We don't yet support WANT bits: */ + open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5f35f35a2da0..2042805da960 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2345,8 +2345,7 @@ static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_ stp->st_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, - &stp->st_access_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; return nfs_ok; @@ -2690,7 +2689,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + u32 op_share_access = open->op_share_access; bool new_access; __be32 status; -- cgit v1.2.3-58-ga151 From 875cd04381e695afbbbef025438113e12af84963 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 11 Oct 2011 13:43:50 +0100 Subject: cifs: Display strictcache mount option in /proc/mounts Commit d39454ffe4a3c85428483b8a8a8e5e797b6363d5 adds a strictcache mount option. This patch allows the display of this mount option in /proc/mounts when listing shares mounted with the strictcache mount option. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 54b8f1e7da94..03d2f6b27cc4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -432,6 +432,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + seq_printf(s, ",strictcache"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); -- cgit v1.2.3-58-ga151 From 0c38a2512df272b14ef4238b476a2e4f70da1479 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Aug 2011 07:17:01 +0000 Subject: xfs: don't serialise direct IO reads on page cache checks There is no need to grab the i_mutex of the IO lock in exclusive mode if we don't need to invalidate the page cache. Taking these locks on every direct IO effective serialises them as taking the IO lock in exclusive mode has to wait for all shared holders to drop the lock. That only happens when IO is complete, so effective it prevents dispatch of concurrent direct IO reads to the same inode. Fix this by taking the IO lock shared to check the page cache state, and only then drop it and take the IO lock exclusively if there is work to be done. Hence for the normal direct IO case, no exclusive locking will occur. 
Signed-off-by: Dave Chinner Tested-by: Joern Engel Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7f7b42469ea7..8fd4a0708d30 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -317,7 +317,19 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) { + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { @@ -330,8 +342,7 @@ xfs_file_aio_read( } } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - } else - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + } trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); -- cgit v1.2.3-58-ga151 From 7271d243f9d1b4106289e4cf876c8b1203de59ab Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Aug 2011 07:17:02 +0000 Subject: xfs: don't serialise adjacent concurrent direct IO appending writes For append write workloads, extending the file requires a certain amount of exclusive locking to be done up front to ensure sanity in things like ensuring that we've zeroed any allocated regions between the old EOF and the start of the new IO. For single threads, this typically isn't a problem, and for large IOs we don't serialise enough for it to be a problem for two threads on really fast block devices. However for smaller IO and larger thread counts we have a problem. Take 4 concurrent sequential, single block sized and aligned IOs. After the first IO is submitted but before it completes, we end up with this state: IO 1 IO 2 IO 3 IO 4 +-------+-------+-------+-------+ ^ ^ | | | | | | | \- ip->i_new_size \- ip->i_size And the IO is done without exclusive locking because offset <= ip->i_size. When we submit IO 2, we see offset > ip->i_size, and grab the IO lock exclusive, because there is a chance we need to do EOF zeroing. However, there is already an IO in progress that avoids the need for IO zeroing because offset <= ip->i_new_size. hence we could avoid holding the IO lock exlcusive for this. Hence after submission of the second IO, we'd end up this state: IO 1 IO 2 IO 3 IO 4 +-------+-------+-------+-------+ ^ ^ | | | | | | | \- ip->i_new_size \- ip->i_size There is no need to grab the i_mutex of the IO lock in exclusive mode if we don't need to invalidate the page cache. Taking these locks on every direct IO effective serialises them as taking the IO lock in exclusive mode has to wait for all shared holders to drop the lock. That only happens when IO is complete, so effective it prevents dispatch of concurrent direct IO writes to the same inode. And so you can see that for the third concurrent IO, we'd avoid exclusive locking for the same reason we avoided the exclusive lock for the second IO. 
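In outline, the submission-time decision is simply whether this write starts beyond every in-flight write; only then can EOF zeroing, and hence the exclusive lock, be needed. A simplified sketch of that check, with stubbed types and no locking (the real patch threads it through xfs_file_aio_write_checks()):

#include <stdbool.h>
#include <stdio.h>

struct inode_sketch {
	long long i_size;     /* current EOF */
	long long i_new_size; /* highest end offset of in-flight writes */
};

/*
 * Zeroing between EOF and the write start is only needed when this
 * write begins beyond every other in-flight write; otherwise an
 * earlier write already covers (or will cover) the gap.
 */
static bool needs_exclusive_for_zeroing(const struct inode_sketch *ip,
					long long pos)
{
	if (ip->i_new_size)
		return pos > ip->i_new_size;
	return pos > ip->i_size;
}

int main(void)
{
	struct inode_sketch ip = { .i_size = 4096, .i_new_size = 8192 };

	/* IO 3 of the example above: starts exactly where IO 2 ends. */
	printf("pos 8192 -> %s\n",
	       needs_exclusive_for_zeroing(&ip, 8192) ? "exclusive" : "shared");
	/* A write leaving a hole past all in-flight IO still serialises. */
	printf("pos 16384 -> %s\n",
	       needs_exclusive_for_zeroing(&ip, 16384) ? "exclusive" : "shared");
	return 0;
}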
Fixing this is a bit more complex than that, because we need to hold a write-submission local value of ip->i_new_size to that clearing the value is only done if no other thread has updated it before our IO completes..... Signed-off-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 68 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8fd4a0708d30..cbbac5cc9c26 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -418,11 +418,13 @@ xfs_aio_write_isize_update( */ STATIC void xfs_aio_write_newsize_update( - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_fsize_t new_size) { - if (ip->i_new_size) { + if (new_size == ip->i_new_size) { xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; + if (new_size == ip->i_new_size) + ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); @@ -473,7 +475,7 @@ xfs_file_splice_write( ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -670,6 +672,7 @@ xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; @@ -677,6 +680,8 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + *new_sizep = 0; +restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); @@ -684,20 +689,41 @@ xfs_file_aio_write_checks( return error; } - new_size = *pos + *count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. + * write. There is no need to issue zeroing if another in-flght IO ends + * at or before this one If zeronig is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > ip->i_size) + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + goto restart; + } error = -xfs_zero_eof(ip, *pos, ip->i_size); + } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. 
+ */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -744,6 +770,7 @@ xfs_file_dio_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -764,13 +791,20 @@ xfs_file_dio_aio_write( if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; - if (unaligned_io || mapping->nrpages || pos > ip->i_size) + /* + * We don't need to take an exclusive lock unless the page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -809,6 +843,7 @@ xfs_file_buffered_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -822,7 +857,7 @@ xfs_file_buffered_aio_write( *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -862,6 +897,7 @@ xfs_file_aio_write( ssize_t ret; int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -881,10 +917,10 @@ xfs_file_aio_write( if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); @@ -905,7 +941,7 @@ xfs_file_aio_write( } out_unlock: - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_rw_iunlock(ip, iolock); return ret; } -- cgit v1.2.3-58-ga151 From 375ec69d2ef6e0797f19f5823e36e249765c3d41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:03 +0000 Subject: xfs: remove delwri buffer handling from xfs_buf_iorequest We cannot ever reach xfs_buf_iorequest for a buffer with XBF_DELWRI set, given that all write handlers make sure that the buffer is removed from the delwri queue beforehand, and we never do reads with the XBF_DELWRI flag set (which the code would not handle correctly anyway).
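The pattern used in this patch, replacing a defensive branch that callers can no longer reach with a debug-build assertion, is easy to illustrate outside the kernel. The following is a minimal userspace C sketch of the idea only; the names (submit_io, FLAG_DELWRI, FLAG_WRITE) are made up for illustration and are not the XFS API.

	#include <assert.h>
	#include <stdio.h>

	#define FLAG_DELWRI (1u << 0)	/* buffer sits on the delayed-write queue */
	#define FLAG_WRITE  (1u << 1)	/* buffer is being written */

	struct buf { unsigned int flags; };

	/*
	 * Every caller now guarantees the buffer has been removed from the
	 * delayed-write queue before submission, so the old re-queueing branch
	 * is dead code. An assertion documents (and, in debug builds, enforces)
	 * the invariant instead of silently handling an impossible state.
	 */
	static void submit_io(struct buf *bp)
	{
		assert(!(bp->flags & FLAG_DELWRI));

		if (bp->flags & FLAG_WRITE)
			printf("waiting for unpin before write\n");
		/* ... issue the I/O here ... */
	}

	int main(void)
	{
		struct buf bp = { .flags = FLAG_WRITE };
		submit_io(&bp);
		return 0;
	}

The design point is that an assertion, unlike the removed branch, costs nothing in production builds while still documenting the caller contract.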
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c57836dc778f..2e71a26da22e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1275,15 +1275,10 @@ xfs_buf_iorequest( { trace_xfs_buf_iorequest(bp, _RET_IP_); - if (bp->b_flags & XBF_DELWRI) { - xfs_buf_delwri_queue(bp, 1); - return 0; - } + ASSERT(!(bp->b_flags & XBF_DELWRI)); - if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - } - xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O -- cgit v1.2.3-58-ga151 From 527cfdf19dd538a5a9e46b9bed0f30a38c28438d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:04 +0000 Subject: xfs: remove the unlock argument to xfs_buf_delwri_queue We can just unlock the buffer in the caller, and the decrement of b_hold would also be needed in the !unlock case; we just never hit that case currently, given that the caller handles it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2e71a26da22e..04689dbbcbba 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); +STATIC void xfs_buf_delwri_queue(xfs_buf_t *); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -940,7 +940,7 @@ xfs_buf_unlock( if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { atomic_inc(&bp->b_hold); bp->b_flags |= XBF_ASYNC; - xfs_buf_delwri_queue(bp, 0); + xfs_buf_delwri_queue(bp); } XB_CLEAR_OWNER(bp); @@ -1049,7 +1049,8 @@ xfs_bdwrite( bp->b_flags &= ~XBF_READ; bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - xfs_buf_delwri_queue(bp, 1); + xfs_buf_delwri_queue(bp); + xfs_buf_unlock(bp); } /* @@ -1562,8 +1563,7 @@ error: */ STATIC void xfs_buf_delwri_queue( - xfs_buf_t *bp, - int unlock) + xfs_buf_t *bp) { struct list_head *dwq = &bp->b_target->bt_delwrite_queue; spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; @@ -1576,8 +1576,7 @@ xfs_buf_delwri_queue( /* If already in the queue, dequeue and place at tail */ if (!list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); - if (unlock) - atomic_dec(&bp->b_hold); + atomic_dec(&bp->b_hold); list_del(&bp->b_list); } @@ -1590,9 +1589,6 @@ xfs_buf_delwri_queue( list_add_tail(&bp->b_list, dwq); bp->b_queuetime = jiffies; spin_unlock(dwlk); - - if (unlock) - xfs_buf_unlock(bp); } void -- cgit v1.2.3-58-ga151 From 5a8ee6bafdd0ab8555adceac8b2cec539a552a1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:05 +0000 Subject: xfs: move more delwri setup into xfs_buf_delwri_queue Do not transfer a reference held by the caller to the buffer on the list, or decrement it in xfs_buf_delwri_queue, but instead grab a new reference if needed, and let the caller drop its own reference. Also move setting of the XBF_DELWRI and XBF_ASYNC flags into xfs_buf_delwri_queue, and only do it if needed. Note that for now buffers reaching xfs_buf_unlock already have XBF_DELWRI set, but that will change in the following patches.
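The reference-counting rule this patch adopts, that a queue takes its own reference rather than stealing the caller's, can be sketched in a few lines of userspace C. Everything below (object, delwri_queue, the refcount helpers) is hypothetical and single-threaded for brevity; the kernel code additionally holds a spinlock around the queue manipulation.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdlib.h>

	struct object {
		atomic_int refcount;
		bool queued;
	};

	static void object_get(struct object *o) { atomic_fetch_add(&o->refcount, 1); }

	static void object_put(struct object *o)
	{
		if (atomic_fetch_sub(&o->refcount, 1) == 1)
			free(o);
	}

	/*
	 * The queue owns its own reference: take a new one when inserting
	 * instead of inheriting the caller's, so the caller can always drop
	 * its reference afterwards, whether or not the object was already
	 * queued.
	 */
	static void delwri_queue(struct object *o)
	{
		if (!o->queued) {
			object_get(o);	/* reference now held by the queue */
			o->queued = true;
		}
		/* if already queued we would just move it to the tail */
	}

	int main(void)
	{
		struct object *o = calloc(1, sizeof(*o));
		atomic_init(&o->refcount, 1);	/* caller's reference */
		delwri_queue(o);
		object_put(o);			/* caller drops its own reference */
		/* the queue's reference keeps o alive at this point */
		return 0;
	}

The payoff is symmetry: the caller's get/put always pair up locally, regardless of what the queue does.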
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 04689dbbcbba..86c0945053c9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -937,11 +937,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_ASYNC; + if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) xfs_buf_delwri_queue(bp); - } XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1046,11 +1043,8 @@ xfs_bdwrite( { trace_xfs_buf_bdwrite(bp, _RET_IP_); - bp->b_flags &= ~XBF_READ; - bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); } /* @@ -1570,23 +1564,22 @@ xfs_buf_delwri_queue( trace_xfs_buf_delwri_queue(bp, _RET_IP_); - ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + ASSERT(!(bp->b_flags & XBF_READ)); spin_lock(dwlk); - /* If already in the queue, dequeue and place at tail */ if (!list_empty(&bp->b_list)) { + /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - atomic_dec(&bp->b_hold); - list_del(&bp->b_list); - } - - if (list_empty(dwq)) { + list_move_tail(&bp->b_list, dwq); + } else { /* start xfsbufd as it is about to have something to do */ - wake_up_process(bp->b_target->bt_task); - } + if (list_empty(dwq)) + wake_up_process(bp->b_target->bt_task); - bp->b_flags |= _XBF_DELWRI_Q; - list_add_tail(&bp->b_list, dwq); + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; + list_add_tail(&bp->b_list, dwq); + } bp->b_queuetime = jiffies; spin_unlock(dwlk); } -- cgit v1.2.3-58-ga151 From 61551f1ee536289084a4a8f1c4f187e2f371c440 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:06 +0000 Subject: xfs: call xfs_buf_delwri_queue directly Unify the ways we add buffers to the delwri queue by always calling xfs_buf_delwri_queue directly. The xfs_bdwrite function is removed and open-coded in its callers, and the two places setting XBF_DELWRI while a buffer is locked and expecting xfs_buf_unlock to pick it up are converted to call xfs_buf_delwri_queue directly, too. Also replace the XFS_BUF_UNDELAYWRITE macro with direct calls to xfs_buf_delwri_dequeue to make the explicit queuing/dequeuing more obvious.
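Open-coding a trivial wrapper like xfs_bdwrite at its call sites trades a little verbosity for an explicit queue/release pairing. Here is a hedged userspace C sketch of the before/after shape; buf_dwrite, buf_delwri_queue and buf_release are illustrative names, not the kernel functions.

	#include <stdio.h>

	struct buf { int id; };

	static void buf_delwri_queue(struct buf *bp) { printf("queue %d\n", bp->id); }
	static void buf_release(struct buf *bp)      { printf("release %d\n", bp->id); }

	/* Before: a thin wrapper hid the two-step sequence behind one call. */
	static void buf_dwrite(struct buf *bp)
	{
		buf_delwri_queue(bp);
		buf_release(bp);
	}

	int main(void)
	{
		struct buf b = { .id = 1 };

		/*
		 * After: the wrapper is deleted and each call site spells out
		 * the sequence, so the ownership hand-off is visible exactly
		 * where it happens.
		 */
		buf_delwri_queue(&b);
		buf_release(&b);

		(void)buf_dwrite;	/* would be removed along with its callers */
		return 0;
	}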
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_buf.c | 21 +++------------------ fs/xfs/xfs_buf.h | 8 +++----- fs/xfs/xfs_buf_item.c | 4 ++-- fs/xfs/xfs_dquot.c | 6 ++++-- fs/xfs/xfs_inode.c | 6 ++++-- fs/xfs/xfs_log_recover.c | 9 ++++++--- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_qm.c | 3 ++- fs/xfs/xfs_rw.c | 2 +- fs/xfs/xfs_trace.h | 1 - fs/xfs/xfs_trans_buf.c | 5 +++-- 12 files changed, 30 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 160bcdc34a6e..b097fd5a2b3f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2189,7 +2189,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { XFS_BUF_STALE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); xfs_buf_relse(bp); bp = NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 86c0945053c9..309eca75fad4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,6 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -937,9 +936,6 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) - xfs_buf_delwri_queue(bp); - XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1036,17 +1032,6 @@ xfs_bwrite( return error; } -void -xfs_bdwrite( - void *mp, - struct xfs_buf *bp) -{ - trace_xfs_buf_bdwrite(bp, _RET_IP_); - - xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1069,7 +1054,7 @@ xfs_bioerror( * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); @@ -1098,7 +1083,7 @@ xfs_bioerror_relse( * change that interface. 
*/ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); XFS_BUF_STALE(bp); bp->b_iodone = NULL; @@ -1555,7 +1540,7 @@ error: /* * Delayed write buffer handling */ -STATIC void +void xfs_buf_delwri_queue( xfs_buf_t *bp) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 620972b8094d..f1a8933becb6 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -198,7 +198,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); @@ -221,8 +220,9 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_dequeue(xfs_buf_t *); -extern void xfs_buf_delwri_promote(xfs_buf_t *); +extern void xfs_buf_delwri_queue(struct xfs_buf *); +extern void xfs_buf_delwri_dequeue(struct xfs_buf *); +extern void xfs_buf_delwri_promote(struct xfs_buf *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); @@ -251,8 +251,6 @@ void xfs_buf_stale(struct xfs_buf *bp); XFS_BUF_DONE(bp); \ } while (0) -#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cac2ecfa6746..3243083d8693 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -992,7 +992,7 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - XFS_BUF_DELAYWRITE(bp); + xfs_buf_delwri_queue(bp); XFS_BUF_DONE(bp); } ASSERT(bp->b_iodone != NULL); @@ -1007,7 +1007,7 @@ xfs_buf_iodone_callbacks( */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index db62959bed13..0f78dd46415c 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1243,8 +1243,10 @@ xfs_qm_dqflush( if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - else - xfs_bdwrite(mp, bp); + else { + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); + } trace_xfs_dqflush_done(dqp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0239a7c7c886..f8fe1c66d420 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2598,8 +2598,10 @@ xfs_iflush( if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - else - xfs_bdwrite(mp, bp); + else { + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); + } return error; corrupt_out: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a199dbcee7d8..22946949bf5e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2176,7 +2176,8 @@ xlog_recover_buffer_pass2( } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); } return (error); @@ -2439,7 +2440,8 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2561,7 +2563,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); 
ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); return (0); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0081657ad985..a957e437bee1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1612,7 +1612,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); + xfs_buf_delwri_dequeue(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); ASSERT(sbp->b_target == mp->m_ddev_targp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 9a0aa76facdf..f51bef885e6d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); /* * goto the next block. */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index c96a8a05ac03..99823c3b9aca 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -149,7 +149,7 @@ xfs_read_buf( } if (bp) { XFS_BUF_UNDONE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_STALE(bp); /* * brelse clears B_ERROR and b_error diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 690fc7a7bd72..bb5e660e0fab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -320,7 +320,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); -DEFINE_BUF_EVENT(xfs_buf_bdwrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 137e2b9e2948..5e5196a269dd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); ASSERT(atomic_read(&bip->bli_refcount) > 0); bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_buf_delwri_queue(bp); + trace_xfs_trans_log_buf(bip); /* @@ -738,7 +739,7 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_delwri_dequeue(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); -- cgit v1.2.3-58-ga151 From c2b006c1da1602551def200e4661535f02b82488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:07 +0000 Subject: xfs: let xfs_bwrite callers handle the xfs_buf_relse Remove the xfs_buf_relse from xfs_bwrite and let the caller handle it to mirror the delwri and read paths. Also remove the mount pointer passed to xfs_bwrite, which is superfluous now that we have a mount pointer in the buftarg.
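Moving the release out of the write routine gives every path, read, delwri and synchronous write, the same shape: the caller that obtained the buffer releases it. A minimal userspace C sketch of that calling convention follows; buf_write and buf_release are illustrative stand-ins, not the kernel API.

	#include <stdio.h>

	struct buf { int error; };

	/*
	 * The write routine reports success or failure but no longer releases
	 * the buffer; ownership stays with the caller throughout.
	 */
	static int buf_write(struct buf *bp)
	{
		/* ... issue synchronous I/O and wait for it ... */
		return bp->error;
	}

	static void buf_release(struct buf *bp)
	{
		(void)bp;	/* drop the lock and the reference here */
	}

	int main(void)
	{
		struct buf b = { .error = 0 };
		int error;

		/* every call site now pairs the write with an explicit release */
		error = buf_write(&b);
		buf_release(&b);
		if (error)
			fprintf(stderr, "write failed: %d\n", error);
		return 0;
	}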
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 7 ++++--- fs/xfs/xfs_buf.c | 8 ++++---- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_dquot.c | 8 ++++---- fs/xfs/xfs_fsops.c | 40 ++++++++++++++++++++++------------------ fs/xfs/xfs_inode.c | 8 ++++---- fs/xfs/xfs_log_recover.c | 11 +++++++---- fs/xfs/xfs_sync.c | 6 ++++-- 8 files changed, 50 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b097fd5a2b3f..8f0f65833500 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2128,9 +2128,10 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); - if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ - return (error); - } + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; src += tmp; valuelen -= tmp; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 309eca75fad4..63dbeb9efc49 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1014,7 +1014,6 @@ xfs_buf_ioerror( int xfs_bwrite( - struct xfs_mount *mp, struct xfs_buf *bp) { int error; @@ -1026,9 +1025,10 @@ xfs_bwrite( xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } return error; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f1a8933becb6..3f543ed3009b 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -197,7 +197,7 @@ extern void xfs_buf_unlock(xfs_buf_t *); ((bp)->b_sema.count <= 0) /* Buffer Read and Write Routines */ -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); +extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0f78dd46415c..3e2ccaedc51e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1242,11 +1242,11 @@ xfs_qm_dqflush( } if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); - else { + error = xfs_bwrite(bp); + else xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); - } + + xfs_buf_relse(bp); trace_xfs_dqflush_done(dqp); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9153d2c77caf..e023f940a3dd 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -216,10 +216,11 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * AG inode header block */ @@ -240,10 +241,11 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * BNO btree root block */ @@ -262,10 +264,11 @@ xfs_growfs_data_private( arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * CNT btree root block */ @@ 
-285,10 +288,11 @@ xfs_growfs_data_private( arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * INO btree root block */ @@ -303,10 +307,10 @@ xfs_growfs_data_private( block->bb_numrecs = 0; block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -396,9 +400,9 @@ xfs_growfs_data_private( * just issue a warning and continue. The real work is * already done and committed. */ - if (!(error = xfs_bwrite(mp, bp))) { - continue; - } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f8fe1c66d420..7f237ba3c292 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2597,11 +2597,11 @@ xfs_iflush( goto cluster_corrupt_out; if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); - else { + error = xfs_bwrite(bp); + else xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); - } + + xfs_buf_relse(bp); return error; corrupt_out: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 22946949bf5e..be173852b2ca 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -268,9 +268,12 @@ xlog_bwrite( xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - if ((error = xfs_bwrite(log->l_mp, bp))) + error = xfs_bwrite(bp); + if (error) { xfs_ioerror_alert("xlog_bwrite", log->l_mp, bp, XFS_BUF_ADDR(bp)); + } + xfs_buf_relse(bp); return error; } @@ -2172,15 +2175,15 @@ xlog_recover_buffer_pass2( (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); } - return (error); + xfs_buf_relse(bp); + return error; } STATIC int diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 4604f90f86a3..90cc197e0433 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -322,6 +322,7 @@ xfs_sync_fsdata( struct xfs_mount *mp) { struct xfs_buf *bp; + int error; /* * If the buffer is pinned then push on the log so we won't get stuck @@ -334,8 +335,9 @@ xfs_sync_fsdata( bp = xfs_getsb(mp, 0); if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); - - return xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + return error; } /* -- cgit v1.2.3-58-ga151 From c4e1c098ee8a72ea563a697a2b175868be86fdc9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:08 +0000 Subject: xfs: use the "delwri" terminology consistently And also remove the strange local lock and delwri list pointers in a few functions. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 43 +++++++++++++++++++------------------------ fs/xfs/xfs_buf.h | 4 ++-- 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 63dbeb9efc49..d3c2b58d7d70 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1489,12 +1489,12 @@ xfs_setsize_buftarg( } STATIC int -xfs_alloc_delwrite_queue( +xfs_alloc_delwri_queue( xfs_buftarg_t *btp, const char *fsname) { - INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spin_lock_init(&btp->bt_delwrite_lock); + INIT_LIST_HEAD(&btp->bt_delwri_queue); + spin_lock_init(&btp->bt_delwri_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) @@ -1524,7 +1524,7 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp, fsname)) + if (xfs_alloc_delwri_queue(btp, fsname)) goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1544,46 +1544,44 @@ void xfs_buf_delwri_queue( xfs_buf_t *bp) { - struct list_head *dwq = &bp->b_target->bt_delwrite_queue; - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + struct xfs_buftarg *btp = bp->b_target; trace_xfs_buf_delwri_queue(bp, _RET_IP_); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(dwlk); + spin_lock(&btp->bt_delwri_lock); if (!list_empty(&bp->b_list)) { /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, dwq); + list_move_tail(&bp->b_list, &btp->bt_delwri_queue); } else { /* start xfsbufd as it is about to have something to do */ - if (list_empty(dwq)) + if (list_empty(&btp->bt_delwri_queue)) wake_up_process(bp->b_target->bt_task); atomic_inc(&bp->b_hold); bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, dwq); + list_add_tail(&bp->b_list, &btp->bt_delwri_queue); } bp->b_queuetime = jiffies; - spin_unlock(dwlk); + spin_unlock(&btp->bt_delwri_lock); } void xfs_buf_delwri_dequeue( xfs_buf_t *bp) { - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(dwlk); + spin_lock(&bp->b_target->bt_delwri_lock); if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); list_del_init(&bp->b_list); dequeued = 1; } bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(dwlk); + spin_unlock(&bp->b_target->bt_delwri_lock); if (dequeued) xfs_buf_rele(bp); @@ -1615,9 +1613,9 @@ xfs_buf_delwri_promote( if (bp->b_queuetime < jiffies - age) return; bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwrite_lock); - list_move(&bp->b_list, &btp->bt_delwrite_queue); - spin_unlock(&btp->bt_delwrite_lock); + spin_lock(&btp->bt_delwri_lock); + list_move(&bp->b_list, &btp->bt_delwri_queue); + spin_unlock(&btp->bt_delwri_lock); } STATIC void @@ -1638,15 +1636,13 @@ xfs_buf_delwri_split( unsigned long age) { xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; int skipped = 0; int force; force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); INIT_LIST_HEAD(list); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { + spin_lock(&target->bt_delwri_lock); + list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { @@ -1663,10 
+1659,9 @@ xfs_buf_delwri_split( } else skipped++; } - spin_unlock(dwlk); + spin_unlock(&target->bt_delwri_lock); return skipped; - } /* @@ -1716,7 +1711,7 @@ xfsbufd( } /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwrite_queue)) + if (list_empty(&target->bt_delwri_queue)) tout = MAX_SCHEDULE_TIMEOUT; schedule_timeout_interruptible(tout); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3f543ed3009b..c23826685d3d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -105,8 +105,8 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_delwrite_queue; - spinlock_t bt_delwrite_lock; + struct list_head bt_delwri_queue; + spinlock_t bt_delwri_lock; unsigned long bt_flags; /* LRU control structures */ -- cgit v1.2.3-58-ga151 From 398d25ef23b10ce75424e0336a8d059dda1dbc8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:09 +0000 Subject: xfs: remove dead ENODEV handling in xfs_destroy_ioend No driver returns ENODEV from its bio completion handler, nor has this ever been documented. Remove the dead code dealing with it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8c37dde4c521..d91564404abf 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -122,17 +122,6 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } - /* - * Volume managers supporting multiple paths can send back ENODEV - * when the final path disappears. In this case continuing to fill - * the page cache with dirty data which cannot be written out is - * evil, so prevent that. - */ - if (unlikely(ioend->io_error == -ENODEV)) { - xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, - __FILE__, __LINE__); - } - xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } -- cgit v1.2.3-58-ga151 From c859cdd1da008b3825555be3242908088a3de366 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:10 +0000 Subject: xfs: defer AIO/DIO completions We really shouldn't complete AIO or DIO requests until we have finished the unwritten extent conversion and size update. This means fsync never has to pick up any ioends as all work has been completed when signalling I/O completion. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 26 +++++++++----------------- fs/xfs/xfs_aops.h | 1 + 2 files changed, 10 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d91564404abf..10660c364105 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -122,6 +122,11 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } + if (ioend->io_iocb) { + if (ioend->io_isasync) + aio_complete(ioend->io_iocb, ioend->io_result, 0); + inode_dio_done(ioend->io_inode); + } xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } @@ -236,8 +241,6 @@ xfs_end_io( /* ensure we don't spin on blocked ioends */ delay(1); } else { - if (ioend->io_iocb) - aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); } } @@ -274,6 +277,7 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early.
*/ atomic_set(&ioend->io_remaining, 1); + ioend->io_isasync = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; @@ -1289,7 +1293,6 @@ xfs_end_io_direct_write( bool is_async) { struct xfs_ioend *ioend = iocb->private; - struct inode *inode = ioend->io_inode; /* * blockdev_direct_IO can return an error even after the I/O @@ -1300,28 +1303,17 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; + ioend->io_iocb = iocb; + ioend->io_result = ret; if (private && size > 0) ioend->io_type = IO_UNWRITTEN; if (is_async) { - /* - * If we are converting an unwritten extent we need to delay - * the AIO completion until after the unwrittent extent - * conversion has completed, otherwise do it ASAP. - */ - if (ioend->io_type == IO_UNWRITTEN) { - ioend->io_iocb = iocb; - ioend->io_result = ret; - } else { - aio_complete(iocb, ret, 0); - } + ioend->io_isasync = 1; xfs_finish_ioend(ioend); } else { xfs_finish_ioend_sync(ioend); } - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } STATIC ssize_t diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 71f721e1a71f..ce3dcb50762e 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -47,6 +47,7 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ + unsigned int io_isasync : 1; /* needs aio_complete */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ -- cgit v1.2.3-58-ga151 From fc0063c4474599b7a066ba76b90902abe21bc675 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:11 +0000 Subject: xfs: reduce ioend latency There is no reason to queue up ioends for processing in user context unless we actually need it. Just complete ioends that do not convert unwritten extents or need a size update from the end_io context. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 10660c364105..e1ff0770784e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -149,6 +149,15 @@ xfs_ioend_new_eof( return isize > ip->i_d.di_size ? isize : 0; } +/* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + /* * Update on-disk file size now that data has been written to disk. The * current in-memory file size is i_size. If a write is beyond eof i_new_size @@ -186,6 +195,9 @@ xfs_setfilesize( /* * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. 
*/ STATIC void xfs_finish_ioend( @@ -194,8 +206,10 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { if (ioend->io_type == IO_UNWRITTEN) queue_work(xfsconvertd_workqueue, &ioend->io_work); - else + else if (xfs_ioend_is_append(ioend)) queue_work(xfsdatad_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); } } -- cgit v1.2.3-58-ga151 From 2b3ffd7eb7b4392e3657c5046b055ca9f1f7cf5e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:12 +0000 Subject: xfs: wait for I/O completion when writing out pages in xfs_setattr_size The current code relies on the xfs_ioend_wait call later on to make sure all I/O actually has completed. The xfs_ioend_wait call will go away soon, so prepare for that by using the waiting filemap function. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_iops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 673704fab748..32aca87bde5e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -833,8 +833,8 @@ xfs_setattr_size( * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XBF_ASYNC, FI_NONE); + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + FI_NONE); if (error) goto out_unlock; } -- cgit v1.2.3-58-ga151 From 4a06fd262dbeb70a2c315f7259e063efa493fe3d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 08:28:13 +0000 Subject: xfs: remove i_iocount We now have an i_dio_count field and surrounding infrastructure to wait for direct I/O completion instead of i_iocount, and we have never needed the iocount waits for buffered I/O given that we only set the page uptodate after finishing all required work. Thus remove i_iocount, and replace the actually needed waits with calls to inode_dio_wait. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 39 +-------------------------------------- fs/xfs/xfs_aops.h | 3 --- fs/xfs/xfs_file.c | 8 ++------ fs/xfs/xfs_iget.c | 2 -- fs/xfs/xfs_inode.h | 1 - fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_super.c | 7 +------ fs/xfs/xfs_sync.c | 8 ++------ fs/xfs/xfs_vnodeops.c | 4 +--- 9 files changed, 9 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e1ff0770784e..46bba3e0af47 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -38,40 +38,6 @@ #include #include - -/* - * Prime number of hash buckets since address is used as the key.
- */ -#define NVSYNC 37 -#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) -static wait_queue_head_t xfs_ioend_wq[NVSYNC]; - -void __init -xfs_ioend_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&xfs_ioend_wq[i]); -} - -void -xfs_ioend_wait( - xfs_inode_t *ip) -{ - wait_queue_head_t *wq = to_ioend_wq(ip); - - wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); -} - -STATIC void -xfs_ioend_wake( - xfs_inode_t *ip) -{ - if (atomic_dec_and_test(&ip->i_iocount)) - wake_up(to_ioend_wq(ip)); -} - void xfs_count_page_state( struct page *page, @@ -115,7 +81,6 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; - struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; @@ -127,7 +92,7 @@ xfs_destroy_ioend( aio_complete(ioend->io_iocb, ioend->io_result, 0); inode_dio_done(ioend->io_inode); } - xfs_ioend_wake(ip); + mempool_free(ioend, xfs_ioend_pool); } @@ -298,7 +263,6 @@ xfs_alloc_ioend( ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; ioend->io_iocb = NULL; @@ -558,7 +522,6 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ce3dcb50762e..116dd5c37034 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -61,9 +61,6 @@ typedef struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); -extern void xfs_ioend_init(void); -extern void xfs_ioend_wait(struct xfs_inode *); - extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cbbac5cc9c26..ee63c4fb3639 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -149,10 +149,6 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_ioend_wait(ip); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure @@ -758,7 +754,7 @@ restart: * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. 
@@ -821,7 +817,7 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) - xfs_ioend_wait(ip); + inode_dio_wait(inode); else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); *iolock = XFS_IOLOCK_SHARED; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 7759812c1bbe..0fa98b1c70ea 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -75,7 +75,6 @@ xfs_inode_alloc( return NULL; } - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); @@ -150,7 +149,6 @@ xfs_inode_free( } /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2380a4bcbece..760140d1dd66 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -257,7 +257,6 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ - atomic_t i_iocount; /* outstanding I/O count */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 32aca87bde5e..e041e917c1d9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -840,9 +840,9 @@ xfs_setattr_size( } /* - * Wait for all I/O to complete. + * Wait for all direct I/O to complete. */ - xfs_ioend_wait(ip); + inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2366c54cc4fa..54d5e102ffe1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -796,8 +796,6 @@ xfs_fs_destroy_inode( if (is_bad_inode(inode)) goto out_reclaim; - xfs_ioend_wait(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -837,7 +835,6 @@ xfs_fs_inode_init_once( inode_init_once(VFS_I(ip)); /* xfs inode */ - atomic_set(&ip->i_iocount, 0); atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); init_waitqueue_head(&ip->i_ipin_wait); @@ -914,9 +911,8 @@ xfs_fs_write_inode( * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for thus, which reduces the number - * of synchronous log foces dramatically. + * of synchronous log forces dramatically. */ - xfs_ioend_wait(ip); error = xfs_log_inode(ip); if (error) goto out; @@ -1681,7 +1677,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_ioend_init(); xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 90cc197e0433..bf2b38c21caa 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -227,21 +227,17 @@ xfs_sync_inode_data( int error = 0; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_wait; + return 0; if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { if (flags & SYNC_TRYLOCK) - goto out_wait; + return 0; xfs_ilock(ip, XFS_IOLOCK_SHARED); } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 
0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - out_wait: - if (flags & SYNC_WAIT) - xfs_ioend_wait(ip); return error; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 51fc429527bc..c2ff0fc86567 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -647,8 +647,6 @@ xfs_inactive( if (truncate) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ioend_wait(ip); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, @@ -2076,7 +2074,7 @@ xfs_free_file_space( if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); /* wait for the completion of any pending DIOs */ - xfs_ioend_wait(ip); + inode_dio_wait(VFS_I(ip)); } rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); -- cgit v1.2.3-58-ga151 From 859f57ca00805e6c482eef1a7ab073097d02c8ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Aug 2011 14:45:11 +0000 Subject: xfs: avoid synchronous transactions when deleting attr blocks Currently xfs_attr_inactive causes a synchronous transaction if we are removing a file that has any extents allocated to the attribute fork, and thus makes XFS extremely slow at removing files with out of line extended attributes. The code looks like a relic from the days before the busy extent list, but with the busy extent list we avoid reusing data and attr extents that have been freed but not committed yet, so this code is just as superfluous as the synchronous transactions for data blocks. Signed-off-by: Christoph Hellwig Reported-by: Bernd Schubert Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 12 ------------ fs/xfs/xfs_bmap.c | 10 +--------- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 8f0f65833500..3dd5c9c374cb 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Signal synchronous inactive transactions unless this is a - * synchronous mount filesystem in which case we know that we're here - * because we've been called out of xfs_inactive which means that the - * last reference is gone and the unlink transaction has already hit - * the disk so async inactive transactions are safe. - */ - if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { - if (dp->i_d.di_anextents > 0) - xfs_trans_set_sync(trans); - } - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 452a291383ab..a6075c0f8456 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3782,19 +3782,11 @@ xfs_bmap_compute_maxlevels( * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done * last due to locking considerations. We never free any extents in - * the first transaction. This is to allow the caller to make the first - * transaction a synchronous one so that the pointers to the data being - * broken in this transaction will be permanent before the data is actually - * freed. This is necessary to prevent blocks from being reallocated - * and written to before the free and reallocation are actually permanent. - * We do not just make the first transaction synchronous here, because - * there are more efficient ways to gain the same protection in some cases - * (see the file truncation code). + * the first transaction.
* * Return 1 if the given transaction was committed and a new one * started, and 0 otherwise in the committed parameter. */ -/*ARGSUSED*/ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ -- cgit v1.2.3-58-ga151 From c58cb165bd44de8aaee9755a144136ae743be116 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Aug 2011 14:42:53 +0000 Subject: xfs: avoid direct I/O write vs buffered I/O race Currently a buffered reader or writer can add pages to the pagecache while we are waiting for the iolock in xfs_file_dio_aio_write. Prevent this by re-checking mapping->nrpages after we got the iolock, and if necessary upgrade the lock to exclusive mode. To simplify this a bit, only take the ilock inside of xfs_file_aio_write_checks. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ee63c4fb3639..06fe97e56e48 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -676,6 +676,7 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); @@ -798,14 +799,24 @@ xfs_file_dio_aio_write( *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; if (mapping->nrpages) { - WARN_ON(*iolock != XFS_IOLOCK_EXCL); ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) @@ -851,7 +862,7 @@ xfs_file_buffered_aio_write( size_t count = ocount; *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) -- cgit v1.2.3-58-ga151 From 04f658ee229f60dbb9a0dc2f3d6871b12b758051 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Aug 2011 05:59:25 +0000 Subject: xfs: improve ioend error handling Return unwritten extent conversion errors to aio_complete. Skip both unwritten extent conversion and size updates if we had an I/O error or the filesystem has been shut down. Return -EIO to the aio/buffer completion handlers in case of a forced shutdown. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 46bba3e0af47..22aadf667862 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -88,8 +88,10 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { - if (ioend->io_isasync) - aio_complete(ioend->io_iocb, ioend->io_result, 0); + if (ioend->io_isasync) { + aio_complete(ioend->io_iocb, ioend->io_error ?
+ ioend->io_error : ioend->io_result, 0); + } inode_dio_done(ioend->io_inode); } @@ -141,9 +143,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - if (unlikely(ioend->io_error)) - return 0; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) return EAGAIN; @@ -189,17 +188,24 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN && - likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - + if (ioend->io_type == IO_UNWRITTEN) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); - if (error) - ioend->io_error = error; + if (error) { + ioend->io_error = -error; + goto done; + } } /* @@ -209,6 +215,7 @@ xfs_end_io( error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); +done: /* * If we didn't complete processing of the ioend, requeue it to the * tail of the workqueue for another attempt later. Otherwise destroy -- cgit v1.2.3-58-ga151 From b522950f0ab8551f2ef56c210ebd50e6c6396601 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Wed, 7 Sep 2011 19:37:54 +0000 Subject: xfs: Check the return value of xfs_buf_get() Check the return value of xfs_buf_get() and fail appropriately. Signed-off-by: Chandra Seetharaman Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 4 ++-- fs/xfs/xfs_fsops.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 3dd5c9c374cb..c7dab0c0bdda 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2109,8 +2109,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(!xfs_buf_geterror(bp)); - + if (!bp) + return ENOMEM; tmp = (valuelen < XFS_BUF_SIZE(bp)) ? 
valuelen : XFS_BUF_SIZE(bp); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index e023f940a3dd..1c6fdeb702ff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -194,6 +194,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -227,6 +231,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -253,6 +261,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -276,6 +288,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -300,6 +316,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); -- cgit v1.2.3-58-ga151 From 2a30f36d9069b0646dcdd73def5fd7ab674bffd6 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 20 Sep 2011 13:56:55 +0000 Subject: xfs: Check the return value of xfs_trans_get_buf() Check the return value of xfs_trans_get_buf() and fail appropriately. Signed-off-by: Chandra Seetharaman Signed-off-by: Alex Elder --- fs/xfs/xfs_attr_leaf.c | 2 ++ fs/xfs/xfs_btree.c | 3 ++- fs/xfs/xfs_dquot.c | 5 ++++- fs/xfs/xfs_ialloc.c | 13 ++++++++----- fs/xfs/xfs_inode.c | 9 ++++++--- fs/xfs/xfs_vnodeops.c | 9 ++++++++- 6 files changed, 30 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8fad9602542b..58c3add07b65 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2948,6 +2948,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, dblkno, dblkcnt, XBF_LOCK); + if (!bp) + return ENOMEM; xfs_trans_binval(*trans, bp); /* * Roll to next transaction. 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2b9fd385e27d..28cc0199dc1f 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -970,7 +970,8 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(!xfs_buf_geterror(*bpp)); + if (!*bpp) + return ENOMEM; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3e2ccaedc51e..0c5fe66ce92b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -402,8 +402,11 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = xfs_buf_geterror(bp))) + + error = xfs_buf_geterror(bp); + if (error) goto error1; + /* * Make a chunk of dquots out of this buffer and log * the entire thing. diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 9f24ec28283b..207e0b0c0730 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -150,7 +150,7 @@ xfs_check_agi_freecount( /* * Initialise a new set of inodes. */ -STATIC void +STATIC int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, @@ -202,8 +202,8 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(!xfs_buf_geterror(fbuf)); - + if (!fbuf) + return ENOMEM; /* * Initialize all inodes in this buffer and then log them. * @@ -225,6 +225,7 @@ xfs_ialloc_inode_init( } xfs_trans_inode_alloc_buf(tp, fbuf); } + return 0; } /* @@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, - random32()); + error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, + args.len, random32()); + if (error) + return error; /* * Convert the results. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7f237ba3c292..d689253fdfda 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1644,7 +1644,7 @@ xfs_iunlink_remove( * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC void +STATIC int xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -1690,6 +1690,8 @@ xfs_ifree_cluster( mp->m_bsize * blks_per_cluster, XBF_LOCK); + if (!bp) + return ENOMEM; /* * Walk the inodes already attached to the buffer and mark them * stale. 
These will all have the flush locks held, so an @@ -1799,6 +1801,7 @@ retry: } xfs_perag_put(pag); + return 0; } /* @@ -1878,10 +1881,10 @@ xfs_ifree( dip->di_mode = 0; if (delete) { - xfs_ifree_cluster(ip, tp, first_ino); + error = xfs_ifree_cluster(ip, tp, first_ino); } - return 0; + return error; } /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c2ff0fc86567..0d1caec873a1 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -308,6 +308,10 @@ xfs_inactive_symlink_rmt( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } xfs_trans_binval(tp, bp); } /* @@ -1648,7 +1652,10 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(!xfs_buf_geterror(bp)); + if (!bp) { + error = ENOMEM; + goto error2; + } if (pathlen < byte_cnt) { byte_cnt = pathlen; } -- cgit v1.2.3-58-ga151 From eabbaf118239d0d4188298b52751040f3b4cc28f Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Thu, 8 Sep 2011 20:18:50 +0000 Subject: xfs: Fix the incorrect comment in the header of _xfs_buf_find Fix the incorrect comment in the header of the function _xfs_buf_find(). Signed-off-by: Chandra Seetharaman Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d3c2b58d7d70..e3af85095ddb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -415,10 +415,7 @@ _xfs_buf_map_pages( /* * Look up, and creates if absent, a lockable buffer for * a given range of an inode. The buffer is returned - locked. If other overlapping buffers exist, they are - released before the new buffer is created and locked, - which may imply that this call will block until those buffers - are unlocked. No I/O is implied by this call. + locked. No I/O is implied by this call. */ xfs_buf_t * _xfs_buf_find( -- cgit v1.2.3-58-ga151 From ed32201e65e15f3e6955cb84cbb544b08f81e5a5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Sat, 17 Sep 2011 13:38:38 +0000 Subject: xfs: Return -EIO when xfs_vn_getattr() failed An attribute of an inode can be fetched via xfs_vn_getattr() in XFS. Currently it returns EIO, not a negative value, when it fails. As a result, the system call returns a non-negative value even though an error occurred. The stat(2), ls and mv commands cannot handle this error and do not work correctly. This patch fixes this bug, and returns -EIO, not EIO, when an error is detected in xfs_vn_getattr().
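The bug class here is the errno sign convention: XFS historically passes positive errno values around internally, while the VFS expects negative ones from methods like ->getattr. A minimal userspace C sketch of the boundary negation follows; internal_getattr and vn_getattr are illustrative names, not the actual functions.

	#include <errno.h>
	#include <stdio.h>

	/* internal helper following the positive-errno convention */
	static int internal_getattr(int force_error)
	{
		return force_error ? EIO : 0;
	}

	/*
	 * The boundary function must hand back the negative-errno convention
	 * the caller expects, hence the negation on the error path.
	 */
	static int vn_getattr(int force_error)
	{
		int error = internal_getattr(force_error);

		return error ? -error : 0;	/* EIO becomes -EIO here */
	}

	int main(void)
	{
		printf("ok path: %d\n", vn_getattr(0));		/* prints 0 */
		printf("error path: %d\n", vn_getattr(1));	/* prints -5, i.e. -EIO */
		return 0;
	}

A positive return value at this boundary is silently treated as success data by callers, which is exactly why stat(2), ls and mv misbehaved.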
Signed-off-by: Mitsuo Hayasaka Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e041e917c1d9..e6b3e7620888 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -465,7 +465,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; -- cgit v1.2.3-58-ga151 From e7455e02e5effcdd49bb28e7dfface2d3473de52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:41 +0000 Subject: xfs: remove the first extent special case in xfs_bmap_add_extent Both xfs_bmap_add_extent_hole_delay and xfs_bmap_add_extent_hole_real already contain code to handle the case where there is no extent to merge with, which is effectively the same as the code duplicated here. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a6075c0f8456..927e5454a43c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -465,27 +465,10 @@ xfs_bmap_add_extent( ASSERT(*idx >= 0); ASSERT(*idx <= nextents); - /* - * This is the first extent added to a new/empty file. - * Special case this one, so other routines get to assume there are - * already extents in the list. - */ - if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - - ASSERT(cur == NULL); - - if (!isnullstartblock(new->br_startblock)) { - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - } else - logflags = 0; - } /* * Any kind of new delayed allocation goes here. */ - else if (isnullstartblock(new->br_startblock)) { + if (isnullstartblock(new->br_startblock)) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); -- cgit v1.2.3-58-ga151 From b9b984d7846e37c57e5b3f8cd883ad45e8ebc2cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:42 +0000 Subject: xfs: remove impossible to read code in xfs_bmap_add_extent_delay_real We already have the worst-case blocks reserved, so xfs_icsb_modify_counters won't fail in xfs_bmap_add_extent_delay_real. In fact we've had an assert to catch this case since day one, and it has never triggered. So remove the code to try smaller reservations, and just return the error for that case in addition to keeping the assert. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 927e5454a43c..ebcd38b14830 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1045,34 +1045,15 @@ xfs_bmap_add_extent_delay_real( temp2 = xfs_bmap_worst_indlen(ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - if (diff > 0 && - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { - /* - * Ick gross gag me with a spoon. - */ - ASSERT(0); /* want to see if this ever happens!
*/ - while (diff > 0) { - if (temp) { - temp--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - if (temp2) { - temp2--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - } + if (diff > 0) { + error = xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0); + ASSERT(!error); + if (error) + goto done; } + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); -- cgit v1.2.3-58-ga151 From ecee76ba9d91fdcbdff933ca1bd41465ca4c4fdb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:43 +0000 Subject: xfs: remove the nextents variable in xfs_bmapi Instead of using a local variable that needs to be updated when we modify the extent map, just check ifp->if_bytes directly where we use it. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index ebcd38b14830..05d38adbb85e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4294,7 +4294,6 @@ xfs_bmapi( xfs_mount_t *mp; /* xfs mount structure */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ - xfs_extnum_t nextents; /* number of extents in file */ xfs_fileoff_t obno; /* old block number (offset) */ xfs_bmbt_irec_t prev; /* previous file extent record */ int tmp_logflags; /* temp flags holder */ @@ -4380,7 +4379,6 @@ xfs_bmapi( goto error0; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); n = 0; end = bno + len; obno = bno; @@ -4622,7 +4620,6 @@ xfs_bmapi( if (error) goto error0; ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); ASSERT(got.br_startoff <= aoff); ASSERT(got.br_startoff + got.br_blockcount >= @@ -4723,7 +4720,6 @@ xfs_bmapi( if (error) goto error0; ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); /* * We may have combined previously unwritten @@ -4781,7 +4777,6 @@ xfs_bmapi( * Else go on to the next record. */ prev = got; - if (++lastx < nextents) { + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { ep = xfs_iext_get_ext(ifp, lastx); xfs_bmbt_get_all(ep, &got); } else { -- cgit v1.2.3-58-ga151 From aef9a89586fc8475bf0333b8736d5aa8aa6f4897 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:44 +0000 Subject: xfs: factor extent map manipulations out of xfs_bmapi To further improve the readability of xfs_bmapi(), factor the pure extent map manipulations out into separate functions. This removes large blocks of logic from the xfs_bmapi() code loop and makes it easier to see the operational logic flow for xfs_bmapi().
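To make the new trim step concrete, the sketch below models the core of the factored-out clamping arithmetic in plain user-space C. Unsigned integers stand in for xfs_fileoff_t/xfs_filblks_t, and the delalloc and XFS_BMAPI_ENTIRE handling is omitted, so read it as an assumption-laden illustration rather than the kernel routine.

#include <stdio.h>

/* Clamp a located extent [got_off, got_off + got_len) to the requested
 * range [bno, end). Assumes they overlap and got_off <= bno, which the
 * real code guarantees at this point. */
static void trim_map(unsigned long long got_off, unsigned long long got_len,
		     unsigned long long bno, unsigned long long end,
		     unsigned long long *map_off, unsigned long long *map_len)
{
	unsigned long long left_in_extent = got_off + got_len - bno;
	unsigned long long left_in_request = end - bno;

	*map_off = bno;
	*map_len = left_in_extent < left_in_request ?
			left_in_extent : left_in_request;
}

int main(void)
{
	unsigned long long off, len;

	/* Extent covers blocks [100, 150); the caller asked for [120, 200). */
	trim_map(100, 50, 120, 200, &off, &len);
	printf("mapped [%llu, %llu)\n", off, off + len); /* [120, 150) */
	return 0;
}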
Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 181 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 107 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 05d38adbb85e..59f63949d3bc 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4248,6 +4248,107 @@ xfs_bmap_validate_ret( #endif /* DEBUG */ +/* + * Trim the returned map to the required bounds + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. 
@@ -4650,44 +4751,9 @@ xfs_bmapi( n++; continue; } - /* - * Then deal with the allocated space we found. - */ - ASSERT(ep != NULL); - if (!(flags & XFS_BMAPI_ENTIRE) && - (got.br_startoff + got.br_blockcount > obno)) { - if (obno > bno) - bno = obno; - ASSERT((bno >= obno) || (n == 0)); - ASSERT(bno < end); - mval->br_startoff = bno; - if (isnullstartblock(got.br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } else - mval->br_startblock = - got.br_startblock + - (bno - got.br_startoff); - /* - * Return the minimum of what we got and what we - * asked for for the length. We can use the len - * variable here because it is modified below - * and we could have been there before coming - * here if the first part of the allocation - * didn't overlap what was asked for. - */ - mval->br_blockcount = - XFS_FILBLKS_MIN(end - bno, got.br_blockcount - - (bno - got.br_startoff)); - mval->br_state = got.br_state; - ASSERT(mval->br_blockcount <= len); - } else { - *mval = got; - if (isnullstartblock(mval->br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } - } + + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); /* * Check if writing previously allocated but @@ -4730,42 +4796,9 @@ xfs_bmapi( continue; } - ASSERT((flags & XFS_BMAPI_ENTIRE) || - ((mval->br_startoff + mval->br_blockcount) <= end)); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - (mval->br_blockcount <= len) || - (mval->br_startoff < obno)); - bno = mval->br_startoff + mval->br_blockcount; - len = end - bno; - if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { - ASSERT(mval->br_startblock == mval[-1].br_startblock); - ASSERT(mval->br_blockcount > mval[-1].br_blockcount); - ASSERT(mval->br_state == mval[-1].br_state); - mval[-1].br_blockcount = mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != HOLESTARTBLOCK && - mval->br_startblock == - mval[-1].br_startblock + mval[-1].br_blockcount && - ((flags & XFS_BMAPI_IGSTATE) || - mval[-1].br_state == mval->br_state)) { - ASSERT(mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount); - mval[-1].br_blockcount += mval->br_blockcount; - } else if (n > 0 && - mval->br_startblock == DELAYSTARTBLOCK && - mval[-1].br_startblock == DELAYSTARTBLOCK && - mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount) { - mval[-1].br_blockcount += mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (!((n == 0) && - ((mval->br_startoff + mval->br_blockcount) <= - obno))) { - mval++; - n++; - } + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + /* * If we're done, stop now. Stop when we've allocated * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise -- cgit v1.2.3-58-ga151 From 5c8ed2021ff291f5e399a9b43c4f699b2fc58fbb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:45 +0000 Subject: xfs: introduce xfs_bmapi_read() xfs_bmapi() currently handles both extent map reading and allocation. As a result, the code is littered with "if (wr)" branches to conditionally do allocation operations if required. This makes the code much harder to follow and causes significant indent issues with the code. 
Given that read mapping is much simpler than allocation, we can split out read mapping from xfs_bmapi() and reuse the logic that we have already factored out to do all the hard work of handling the extent map manipulations. This results in a much simpler function for the common extent read operations, and will allow the allocation code to be simplified in another commit. Once xfs_bmapi_read() is implemented, convert all the callers of xfs_bmapi() that are only reading extents to use the new function. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_aops.c | 8 ++-- fs/xfs/xfs_attr.c | 30 ++++++-------- fs/xfs/xfs_attr_leaf.c | 5 +-- fs/xfs/xfs_bmap.c | 104 +++++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_bmap.h | 4 ++ fs/xfs/xfs_da_btree.c | 9 ++--- fs/xfs/xfs_dir2_leaf.c | 6 +-- fs/xfs/xfs_dquot.c | 5 +-- fs/xfs/xfs_file.c | 10 ++--- fs/xfs/xfs_inode.c | 12 +++--- fs/xfs/xfs_iomap.c | 4 +- fs/xfs/xfs_qm.c | 7 +--- fs/xfs/xfs_vnodeops.c | 24 ++++++------ 13 files changed, 151 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 22aadf667862..11b2aad982d4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -315,8 +315,8 @@ xfs_map_blocks( count = mp->m_maxioffset - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - bmapi_flags, NULL, 0, imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -1138,8 +1138,8 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index c7dab0c0bdda..41ef02b7185a 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1963,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) lblkno = args->rmtblkno; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL); + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap >= 1); @@ -2092,14 +2091,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - NULL); - if (error) { + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2155,16 +2151,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value.
*/ - xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - args->flist); - if (error) { + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 58c3add07b65..d4906e7c9787 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL); + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) { return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 59f63949d3bc..fc5acf0b921e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4349,6 +4349,98 @@ xfs_bmapi_update_map( *map = mval; } +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. 
*/ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. @@ -5490,10 +5582,9 @@ xfs_getbmap( do { nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - bmapi_flags, NULL, 0, map, &nmap, - NULL); + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -6084,9 +6175,8 @@ xfs_bmap_punch_delalloc_range( * trying to remove a real extent (which requires a * transaction) or a hole, which is probably a bad idea... */ - error = xfs_bmapi(NULL, ip, start_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); if (error) { /* something screwed, just bail */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c62234bde053..16257c6eba71 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -294,6 +294,10 @@ xfs_bmapi( int *nmap, /* i/o: mval size/count */ xfs_bmap_free_t *flist); /* i/o: list extents to free */ +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); + /* * Map file blocks to filesystem blocks, simple version. * One block only, read-only. diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index ee9d5427fcd4..44dfa39099ef 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1995,11 +1995,10 @@ xfs_da_do_buf( } else { mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); nmap = nfsb; - if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, - nfsb, - XFS_BMAPI_METADATA | - xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL))) + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, + mapp, &nmap, + xfs_bmapi_aflag(whichfork)); + if (error) goto exit0; } } else { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ca2386d82cdf..66e108f561a3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents( * we already have in the table. */ nmap = map_size - map_valid; - error = xfs_bmapi(NULL, dp, - map_off, + error = xfs_bmapi_read(dp, map_off, xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, - XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL); + &map[map_valid], &nmap, 0); /* * Don't know if we should ignore this or * try to return an error. 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0c5fe66ce92b..c377961657ee 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -488,9 +488,8 @@ xfs_qm_dqtobp( /* * Find the block map; no allocations yet */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL); + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 06fe97e56e48..558543c146b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -509,11 +509,9 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL); - if (error) { + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + if (error) return error; - } ASSERT(nimaps > 0); /* * If the block underlying isize is just a hole, then there @@ -604,8 +602,8 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d689253fdfda..f690d3a10b34 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1187,6 +1187,7 @@ xfs_isize_check( xfs_fileoff_t map_first; int nimaps; xfs_bmbt_irec_t imaps[2]; + int error; if (!S_ISREG(ip->i_d.di_mode)) return; @@ -1203,13 +1204,12 @@ xfs_isize_check( * The filesystem could be shutting down, so bmapi may return * an error. */ - if (xfs_bmapi(NULL, ip, map_first, + error = xfs_bmapi_read(ip, map_first, (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - - map_first), - XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL)) - return; + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 091d82b94c4d..544f053860f1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -300,8 +300,8 @@ xfs_iomap_eof_want_preallocate( while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); if (error) return error; for (n = 0; n < imaps; n++) { diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f51bef885e6d..ddaf97a57ec6 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1347,11 +1347,8 @@ xfs_qm_dqiterate( * the inode is never added to the transaction. 
*/ xfs_ilock(qip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, qip, lblkno, - maxlblkcnt - lblkno, - XFS_BMAPI_METADATA, - NULL, - 0, map, &nmaps, NULL); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 0d1caec873a1..63874a87b378 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -72,8 +72,8 @@ xfs_readlink_bmap( xfs_buf_t *bp; int error = 0; - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL); + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, + 0); if (error) goto out; @@ -178,8 +178,7 @@ xfs_free_eofblocks( nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -297,9 +296,9 @@ xfs_inactive_symlink_rmt( done = 0; xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); - if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), - XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list))) + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), + mval, &nmaps, 0); + if (error) goto error0; /* * Invalidate the block(s). @@ -1981,8 +1980,7 @@ xfs_zero_remaining_bytes( for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; - error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -2101,8 +2099,8 @@ xfs_free_file_space( */ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; - error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2116,8 +2114,8 @@ xfs_free_file_space( startoffset_fsb += mp->m_sb.sb_rextsize - mod; } nimap = 1; - error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); -- cgit v1.2.3-58-ga151 From 5b777ad517ee75d3bb8d67c142d808822e46601b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:46 +0000 Subject: xfs: remove xfs_bmapi_single() Now we have xfs_bmapi_read, there is no need for xfs_bmapi_single(). Change the remaining caller over and kill the function. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 52 --------------------------------------------------- fs/xfs/xfs_bmap.h | 15 --------------- fs/xfs/xfs_da_btree.c | 32 ++++++++----------------------- fs/xfs/xfs_rtalloc.c | 32 +++++++++++-------------------- 4 files changed, 19 insertions(+), 112 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index fc5acf0b921e..fd06bf17d09c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4968,58 +4968,6 @@ error0: return error; } -/* - * Map file blocks to filesystem blocks, simple version. - * One block (extent) only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. 
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno) /* starting file offs. mapped */ -{ - int eof; /* we've hit the end of extents */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - XFS_STATS_INC(xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof || got.br_startoff > bno) { - *fsb = NULLFSBLOCK; - return 0; - } - ASSERT(!isnullstartblock(got.br_startblock)); - ASSERT(bno < got.br_startoff + got.br_blockcount); - *fsb = got.br_startblock + (bno - got.br_startoff); - return 0; -} - /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 16257c6eba71..01693390df21 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -298,21 +298,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); -/* - * Map file blocks to filesystem blocks, simple version. - * One block only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno); /* starting file offs. mapped */ - /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 44dfa39099ef..70a5f580e5ed 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1975,32 +1975,16 @@ xfs_da_do_buf( /* * Optimize the one-block case. 
*/ - if (nfsb == 1) { - xfs_fsblock_t fsb; - - if ((error = - xfs_bmapi_single(trans, dp, whichfork, &fsb, - (xfs_fileoff_t)bno))) { - return error; - } + if (nfsb == 1) mapp = ↦ - if (fsb == NULLFSBLOCK) { - nmap = 0; - } else { - map.br_startblock = fsb; - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = 1; - nmap = 1; - } - } else { + else mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); - nmap = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, - mapp, &nmap, - xfs_bmapi_aflag(whichfork)); - if (error) - goto exit0; - } + + nmap = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, + &nmap, xfs_bmapi_aflag(whichfork)); + if (error) + goto exit0; } else { map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); map.br_startoff = (xfs_fileoff_t)bno; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 35561a511b57..e5f40c6460b2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -856,33 +856,23 @@ xfs_rtbuf_get( xfs_buf_t **bpp) /* output: buffer for the block */ { xfs_buf_t *bp; /* block buffer, result */ - xfs_daddr_t d; /* disk addr of block */ - int error; /* error value */ - xfs_fsblock_t fsb; /* fs block number for block */ xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap; + int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; - /* - * Map from the file offset (block) and inode number to the - * file system block. - */ - error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); - if (error) { + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fsb != NULLFSBLOCK); - /* - * Convert to disk address for buffer cache. - */ - d = XFS_FSB_TO_DADDR(mp, fsb); - /* - * Read the buffer. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); - if (error) { + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; -- cgit v1.2.3-58-ga151 From b64dfe4e180ab5047c59bcbe379538eb23be4d8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:47 +0000 Subject: xfs: factor delalloc reservations out of xfs_bmapi Move the reservation of delayed allocations, and addition of delalloc regions to the extent trees into a new helper function. For now this adds some twisted goto logic to xfs_bmapi, but that will be cleaned up in the following patches. 
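The helper introduced here follows the familiar reserve-then-unwind shape: take the quota, block, and worst-case indirect-block reservations in order, and release whatever was taken, in reverse order, if a later step fails. The compilable sketch below shows only that idiom; the reserve/unreserve functions are hypothetical stand-ins, not the XFS quota or superblock-counter APIs.

#include <stdio.h>

/* Hypothetical resources; the last reservation fails on purpose so
 * the unwind path runs. */
static int reserve_quota(void)           { return 0; }
static int reserve_data_blocks(void)     { return 0; }
static int reserve_indirect_blocks(void) { return -1; }
static void unreserve_data_blocks(void)  { puts("data blocks released"); }
static void unreserve_quota(void)        { puts("quota released"); }

static int reserve_delalloc_sketch(void)
{
	int error;

	error = reserve_quota();
	if (error)
		return error;

	error = reserve_data_blocks();
	if (error)
		goto out_unreserve_quota;

	error = reserve_indirect_blocks();
	if (error)
		goto out_unreserve_blocks;

	return 0;	/* success: all three reservations are held */

out_unreserve_blocks:
	unreserve_data_blocks();
out_unreserve_quota:
	unreserve_quota();
	return error;
}

int main(void)
{
	printf("result: %d\n", reserve_delalloc_sketch());
	return 0;
}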
Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 202 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 118 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index fd06bf17d09c..cd01783a89d7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4441,6 +4441,120 @@ xfs_bmapi_read( return 0; } +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + xfs_fsblock_t firstblock = NULLFSBLOCK; + struct xfs_btree_cur *cur = NULL; + int tmp_logflags = 0; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * Make sure we don't exceed a single extent length when we + * align the extent by reducing length we are going to + * allocate by the maximum amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + error = xfs_bmap_add_extent(NULL, ip, lastx, &cur, got, &firstblock, + NULL, &tmp_logflags, XFS_DATA_FORK); + ASSERT(!error); + ASSERT(!tmp_logflags); + ASSERT(!cur); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent might + * have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + else + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + /* * Map file blocks to filesystem blocks. 
* File range is given by the bno/len pair. @@ -4479,7 +4593,6 @@ xfs_bmapi( int error; /* error return */ xfs_bmbt_irec_t got; /* current file extent record */ xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extlen_t indlen; /* indirect blocks length */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks left after allocation */ @@ -4615,43 +4728,8 @@ xfs_bmapi( } minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; if (flags & XFS_BMAPI_DELAY) { - xfs_extlen_t extsz; - - /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); - if (extsz) { - /* - * make sure we don't exceed a single - * extent length when we align the - * extent by reducing length we are - * going to allocate by the maximum - * amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, - MAXEXTLEN - (2 * extsz - 1)); - error = xfs_bmap_extsize_align(mp, - &got, &prev, extsz, - rt, eof, - flags&XFS_BMAPI_DELAY, - flags&XFS_BMAPI_CONVERT, - &aoff, &alen); - ASSERT(!error); - } - - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - - /* - * Make a transaction-less quota reservation for - * delayed allocation blocks. This number gets - * adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_trans_reserve_quota_nblks( - NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); if (error) { if (n == 0) { *nmap = 0; @@ -4661,51 +4739,7 @@ xfs_bmapi( break; } - /* - * Split changing sb for alen and indlen since - * they could be coming from different places. - */ - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - if (rt) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); - } else { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); - } - if (!error) { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); - if (error && rt) - xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); - else if (error) - xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); - } - - if (error) { - if (XFS_IS_QUOTA_ON(mp)) - /* unreserve the blocks now */ - (void) - xfs_trans_unreserve_quota_nblks( - NULL, ip, - (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - break; - } - - ip->i_delayed_blks += alen; - abno = nullstartblock(indlen); + goto trim_extent; } else { /* * If first time, allocate and fill in @@ -4843,7 +4877,7 @@ xfs_bmapi( n++; continue; } - +trim_extent: /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); -- cgit v1.2.3-58-ga151 From 4403280aa5c00c6074f2dc23e1cfc11f2bfb0032 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:48 +0000 Subject: xfs: introduce xfs_bmapi_delay() Delalloc reservations are much simpler than allocations, so give them a separate bmapi-level interface. Using the previously added xfs_bmapi_reserve_delalloc we get a function that is only minimally more complicated than xfs_bmapi_read, which is far from the complexity in xfs_bmapi. Also remove the XFS_BMAPI_DELAY code after switching over the only user to xfs_bmapi_delay. 
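In outline, xfs_bmapi_read() and the new xfs_bmapi_delay() now share one walk: locate the extent covering the current offset, handle a hole in a variant-specific way, trim the mapping to the request, and advance. The toy model below captures only that control flow over a fixed extent list; the types, names, and the pretend delalloc reservation are assumptions for illustration, not the XFS implementation.

#include <stdio.h>

/* Two mapped extents with a hole between them, in file-block units. */
struct irec { unsigned long long off, len; };
static const struct irec extents[] = { { 0, 4 }, { 10, 6 } };
#define NEXTENTS (sizeof(extents) / sizeof(extents[0]))

static void map_range(unsigned long long bno, unsigned long long end,
		      int reserve_holes)
{
	unsigned int i = 0;

	while (bno < end) {
		/* Skip extents that end at or before the current offset. */
		while (i < NEXTENTS && extents[i].off + extents[i].len <= bno)
			i++;

		if (i == NEXTENTS || extents[i].off > bno) {
			/* In a hole: the read variant reports it, the delay
			 * variant would reserve delalloc blocks over it. */
			unsigned long long hend = end;

			if (i < NEXTENTS && extents[i].off < end)
				hend = extents[i].off;
			printf("  [%llu,%llu) %s\n", bno, hend,
			       reserve_holes ? "delalloc reserved" : "hole");
			bno = hend;
		} else {
			/* Found a covering extent: trim it to the request. */
			unsigned long long eend =
				extents[i].off + extents[i].len;
			unsigned long long mend = eend < end ? eend : end;

			printf("  [%llu,%llu) mapped\n", bno, mend);
			bno = mend;
		}
	}
}

int main(void)
{
	puts("read-style walk:");
	map_range(2, 14, 0);
	puts("delay-style walk:");
	map_range(2, 14, 1);
	return 0;
}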
Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 115 +++++++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_bmap.h | 5 ++- fs/xfs/xfs_iomap.c | 9 +---- 3 files changed, 94 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index cd01783a89d7..6f3276974555 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4238,7 +4238,7 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) + if (flags & XFS_BMAPI_WRITE) ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || @@ -4555,6 +4555,90 @@ out_unreserve_quota: return error; } +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. 
@@ -4663,7 +4747,6 @@ xfs_bmapi( */ if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ wr = 0; /* no allocations are allowed */ - ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); logflags = 0; nallocs = 0; cur = NULL; @@ -4698,8 +4781,7 @@ xfs_bmapi( if (eof && !wr) got.br_startoff = end; inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - isnullstartblock(got.br_startblock); + wasdelay = wr && !inhole && isnullstartblock(got.br_startblock); /* * First, deal with the hole before the allocated space * that we found, if any. @@ -4727,20 +4809,7 @@ xfs_bmapi( aoff = bno; } minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - if (flags & XFS_BMAPI_DELAY) { - error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, - &prev, &lastx, eof); - if (error) { - if (n == 0) { - *nmap = 0; - ASSERT(cur == NULL); - return error; - } - break; - } - - goto trim_extent; - } else { + { /* * If first time, allocate and fill in * once-only bma fields. @@ -4851,14 +4920,8 @@ xfs_bmapi( ASSERT(got.br_startoff <= aoff); ASSERT(got.br_startoff + got.br_blockcount >= aoff + alen); -#ifdef DEBUG - if (flags & XFS_BMAPI_DELAY) { - ASSERT(isnullstartblock(got.br_startblock)); - ASSERT(startblockval(got.br_startblock) > 0); - } ASSERT(got.br_state == XFS_EXT_NORM || got.br_state == XFS_EXT_UNWRITTEN); -#endif /* * Fall down into the found allocated space case. */ @@ -4877,7 +4940,7 @@ xfs_bmapi( n++; continue; } -trim_extent: + /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); @@ -4887,7 +4950,7 @@ trim_extent: */ if (wr && ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || + ((flags & XFS_BMAPI_PREALLOC) == 0)) || (mval->br_state == XFS_EXT_NORM && ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 01693390df21..c70c19c7f1fa 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -65,7 +65,6 @@ typedef struct xfs_bmap_free * Flags for xfs_bmapi */ #define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ @@ -82,7 +81,6 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_WRITE, "WRITE" }, \ - { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -297,6 +295,9 @@ xfs_bmapi( int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); /* * Unmap (remove) blocks from a file. 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 544f053860f1..681ba34c9233 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -381,7 +381,6 @@ xfs_iomap_write_delay( xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; xfs_fileoff_t ioalign; - xfs_fsblock_t firstblock; xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; @@ -425,12 +424,8 @@ retry: } nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL); + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); switch (error) { case 0: case ENOSPC: -- cgit v1.2.3-58-ga151 From 1fd044d9c6735e669f0db025f18023e56a608130 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:49 +0000 Subject: xfs: do not use xfs_bmap_add_extent for adding delalloc extents We can just call xfs_bmap_add_extent_hole_delay directly to add delayed allocation regions to the extent tree, instead of going through all the complexities of xfs_bmap_add_extent that aren't needed for this simple case. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 52 +++++++++------------------------------------------- 1 file changed, 9 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6f3276974555..c9492e23447c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -100,17 +100,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp); /* inode logging flags */ -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - /* * Called by xfs_bmap_add_extent to handle cases converting a hole * to a real allocation. @@ -431,8 +420,7 @@ xfs_bmap_add_attrfork_local( } /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). + * Update file extent records and the btree after allocating space. */ STATIC int /* error */ xfs_bmap_add_extent( @@ -464,21 +452,12 @@ xfs_bmap_add_extent( ASSERT(*idx >= 0); ASSERT(*idx <= nextents); + ASSERT(!isnullstartblock(new->br_startblock)); - /* - * Any kind of new delayed allocation goes here. - */ - if (isnullstartblock(new->br_startblock)) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); - } /* * Real allocation off the end of the file. */ - else if (*idx == nextents) { + if (*idx == nextents) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); @@ -1581,16 +1560,13 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation.
*/ -/*ARGSUSED*/ -STATIC int /* error */ +STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -1725,8 +1701,6 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - *logflagsp = 0; - return 0; } /* @@ -4455,9 +4429,6 @@ xfs_bmapi_reserve_delalloc( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_extlen_t alen; xfs_extlen_t indlen; - xfs_fsblock_t firstblock = NULLFSBLOCK; - struct xfs_btree_cur *cur = NULL; - int tmp_logflags = 0; char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; @@ -4524,16 +4495,11 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - - error = xfs_bmap_add_extent(NULL, ip, lastx, &cur, got, &firstblock, - NULL, &tmp_logflags, XFS_DATA_FORK); - ASSERT(!error); - ASSERT(!tmp_logflags); - ASSERT(!cur); + xfs_bmap_add_extent_hole_delay(ip, lastx, got); /* - * Update our extent pointer, given that xfs_bmap_add_extent might - * have merged it into one of the neighbouring ones. + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); -- cgit v1.2.3-58-ga151 From 7e47a4efde33aa3f0cb901e086a75751c2269f04 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:50 +0000 Subject: xfs: factor extent allocation out of xfs_bmapi To further improve the readability of xfs_bmapi(), factor the extent allocation out into a separate function. This removes a large block of logic from the xfs_bmapi() code loop and makes it easier to see the operational logic flow for xfs_bmapi(). Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 302 +++++++++++++++++++++++++++++------------------------- 1 file changed, 162 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c9492e23447c..311cbc1d64c5 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4605,6 +4605,147 @@ xfs_bmapi_delay( } +STATIC int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma, + xfs_extnum_t *lastx, + struct xfs_btree_cur **cur, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int flags, + int *nallocs, + int *logflags) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + xfs_fsblock_t abno; + xfs_extlen_t alen; + xfs_fileoff_t aoff; + int error; + int rt; + + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. 
+ */ + if (bma->wasdel) { + alen = (xfs_extlen_t)bma->gotp->br_blockcount; + aoff = bma->gotp->br_startoff; + if (*lastx != NULLEXTNUM && *lastx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), + bma->prevp); + } + } else { + alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->alen, MAXEXTLEN); + if (!bma->eof) + alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, + bma->gotp->br_startoff - bma->off); + aoff = bma->off; + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma->userdata = (aoff == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + /* + * Fill in changeable bma fields. + */ + bma->alen = alen; + bma->off = aoff; + bma->firstblock = *firstblock; + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; + bma->low = flist->xbf_low; + bma->aeof = 0; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && alen >= mp->m_dalign && + !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma->ip, aoff, whichfork, &bma->aeof); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + /* + * Copy out result fields. + */ + abno = bma->rval; + flist->xbf_low = bma->low; + alen = bma->alen; + aoff = bma->off; + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, bma->firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, bma->firstblock))); + *firstblock = bma->firstblock; + if (*cur) + (*cur)->bc_private.b.firstblock = *firstblock; + if (abno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { + (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + (*cur)->bc_private.b.firstblock = *firstblock; + (*cur)->bc_private.b.flist = flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + (*nallocs)++; + + if (*cur) + (*cur)->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->gotp->br_startoff = aoff; + bma->gotp->br_startblock = abno; + bma->gotp->br_blockcount = alen; + bma->gotp->br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->gotp->br_state = XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, bma->gotp, + firstblock, flist, logflags, whichfork); + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent might + * have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + + ASSERT(bma->gotp->br_startoff <= aoff); + ASSERT(bma->gotp->br_startoff + bma->gotp->br_blockcount >= + aoff + alen); + ASSERT(bma->gotp->br_state == XFS_EXT_NORM || + bma->gotp->br_state == XFS_EXT_UNWRITTEN); + return 0; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. 
@@ -4632,9 +4773,6 @@ xfs_bmapi( int *nmap, /* i/o: mval size/count */ xfs_bmap_free_t *flist) /* i/o: list extents to free */ { - xfs_fsblock_t abno; /* allocated block number */ - xfs_extlen_t alen; /* allocated extent length */ - xfs_fileoff_t aoff; /* allocated file offset */ xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ @@ -4646,7 +4784,6 @@ xfs_bmapi( xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_extlen_t minlen; /* min allocation size */ xfs_mount_t *mp; /* xfs mount structure */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ @@ -4737,7 +4874,13 @@ xfs_bmapi( n = 0; end = bno + len; obno = bno; - bma.ip = NULL; + + bma.tp = tp; + bma.ip = ip; + bma.prevp = &prev; + bma.gotp = &got; + bma.total = total; + bma.userdata = 0; while (bno < end && n < *nmap) { /* @@ -4753,144 +4896,23 @@ xfs_bmapi( * that we found, if any. */ if (wr && (inhole || wasdelay)) { - /* - * For the wasdelay case, we could also just - * allocate the stuff asked for in this bmap call - * but that wouldn't be as good. - */ - if (wasdelay) { - alen = (xfs_extlen_t)got.br_blockcount; - aoff = got.br_startoff; - if (lastx != NULLEXTNUM && lastx) { - ep = xfs_iext_get_ext(ifp, lastx - 1); - xfs_bmbt_get_all(ep, &prev); - } - } else { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, MAXEXTLEN); - if (!eof) - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(alen, - got.br_startoff - bno); - aoff = bno; - } - minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - { - /* - * If first time, allocate and fill in - * once-only bma fields. - */ - if (bma.ip == NULL) { - bma.tp = tp; - bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; - bma.total = total; - bma.userdata = 0; - } - /* Indicate if this is the first user data - * in the file, or just any user data. - */ - if (!(flags & XFS_BMAPI_METADATA)) { - bma.userdata = (aoff == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : - XFS_ALLOC_USERDATA; - } - /* - * Fill in changeable bma fields. - */ - bma.eof = eof; - bma.firstblock = *firstblock; - bma.alen = alen; - bma.off = aoff; - bma.conv = !!(flags & XFS_BMAPI_CONVERT); - bma.wasdel = wasdelay; - bma.minlen = minlen; - bma.low = flist->xbf_low; - bma.minleft = minleft; - /* - * Only want to do the alignment at the - * eof if it is userdata and allocation length - * is larger than a stripe unit. - */ - if (mp->m_dalign && alen >= mp->m_dalign && - (!(flags & XFS_BMAPI_METADATA)) && - (whichfork == XFS_DATA_FORK)) { - if ((error = xfs_bmap_isaeof(ip, aoff, - whichfork, &bma.aeof))) - goto error0; - } else - bma.aeof = 0; - /* - * Call allocator. - */ - if ((error = xfs_bmap_alloc(&bma))) - goto error0; - /* - * Copy out result fields. 
- */ - abno = bma.rval; - if ((flist->xbf_low = bma.low)) - minleft = 0; - alen = bma.alen; - aoff = bma.off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma.firstblock))); - *firstblock = bma.firstblock; - if (cur) - cur->bc_private.b.firstblock = - *firstblock; - if (abno == NULLFSBLOCK) - break; - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, tp, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - /* - * Bump the number of extents we've allocated - * in this call. - */ - nallocs++; - } - if (cur) - cur->bc_private.b.flags = - wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; - got.br_startoff = aoff; - got.br_startblock = abno; - got.br_blockcount = alen; - got.br_state = XFS_EXT_NORM; /* assume normal */ - /* - * Determine state of extent, and the filesystem. - * A wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - */ - if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { - if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) - got.br_state = XFS_EXT_UNWRITTEN; - } - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, - firstblock, flist, &tmp_logflags, - whichfork); + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.alen = len; + bma.off = bno; + bma.minleft = minleft; + + error = xfs_bmapi_allocate(&bma, &lastx, &cur, + firstblock, flist, flags, &nallocs, + &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= aoff); - ASSERT(got.br_startoff + got.br_blockcount >= - aoff + alen); - ASSERT(got.br_state == XFS_EXT_NORM || - got.br_state == XFS_EXT_UNWRITTEN); - /* - * Fall down into the found allocated space case. - */ + if (flist && flist->xbf_low) + minleft = 0; + if (bma.rval == NULLFSBLOCK) + break; } else if (inhole) { /* * Reading in a hole. -- cgit v1.2.3-58-ga151 From b447fe5a05cbd01c4bf7fe2fa41cb9e99ce7e58e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:51 +0000 Subject: xfs: factor unwritten extent map manipulations out of xfs_bmapi To further improve the readability of xfs_bmapi(), factor the unwritten extent conversion out into a separate function. This removes a large block of logic from the xfs_bmapi() code loop and makes it easier to see the operational logic flow for xfs_bmapi(). Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 107 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 311cbc1d64c5..c2e49fd18bfd 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4089,7 +4089,6 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; xfs_extnum_t start; - num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); @@ -4746,6 +4745,69 @@ xfs_bmapi_allocate( return 0; } +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + xfs_extnum_t *lastx, + struct xfs_btree_cur **cur, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int flags, + int *logflags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int error; + + *logflags = 0; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { + *cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + (*cur)->bc_private.b.firstblock = *firstblock; + (*cur)->bc_private.b.flist = flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, mval, + firstblock, flist, logflags, whichfork); + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent might + * have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return EAGAIN; + return 0; +} + /* * Map file blocks to filesystem blocks. * File range is given by the bno/len pair. @@ -4932,45 +4994,16 @@ xfs_bmapi( /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); - /* - * Check if writing previously allocated but - * unwritten extents. - */ - if (wr && - ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & XFS_BMAPI_PREALLOC) == 0)) || - (mval->br_state == XFS_EXT_NORM && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == - (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { - /* - * Modify (by adding) the state flag, if writing. - */ - ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, - tp, ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) - ? XFS_EXT_NORM - : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, - firstblock, flist, &tmp_logflags, - whichfork); + /* Execute unwritten extent conversion if necessary */ + if (wr) { + error = xfs_bmapi_convert_unwritten(&bma, mval, len, + &lastx, &cur, firstblock, flist, flags, + &tmp_logflags); logflags |= tmp_logflags; + if (error == EAGAIN) + continue; if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - /* - * We may have combined previously unwritten - * space with written space, so generate - * another request. - */ - if (mval->br_blockcount < len) - continue; } /* update the extent map to return */ -- cgit v1.2.3-58-ga151 From c0dc7828af6952643219292be29e482ef74cb261 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:52 +0000 Subject: xfs: rename xfs_bmapi to xfs_bmapi_write Now that all the read-only users of xfs_bmapi have been converted to use xfs_bmapi_read(), we can remove all the read-only handling cases from xfs_bmapi(). Once this is done, rename xfs_bmapi to xfs_bmapi_write to reflect the fact it is for allocation only. This enables us to kill the XFS_BMAPI_WRITE flag as well. 
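The call-site conversion is mechanical. Schematically, with placeholder arguments rather than any one real caller:

	/* before: read vs. write behaviour selected by a mode flag */
	error = xfs_bmapi(tp, ip, bno, len,
			  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
			  &firstblock, total, &map, &nmap, &flist);

	/* after: the function name carries the intent, so the flag goes away */
	error = xfs_bmapi_write(tp, ip, bno, len,
				XFS_BMAPI_METADATA,
				&firstblock, total, &map, &nmap, &flist);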
Also clean up xfs_bmapi_write to the style used in the newly added xfs_bmapi_read/delay functions. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 5 +- fs/xfs/xfs_bmap.c | 225 +++++++++++++++++++++----------------------------- fs/xfs/xfs_bmap.h | 50 +++-------- fs/xfs/xfs_da_btree.c | 10 +-- fs/xfs/xfs_dquot.c | 12 ++- fs/xfs/xfs_iomap.c | 20 ++--- fs/xfs/xfs_rtalloc.c | 6 +- fs/xfs/xfs_vnodeops.c | 19 ++--- 8 files changed, 135 insertions(+), 212 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 41ef02b7185a..5484766938f9 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2039,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | - XFS_BMAPI_WRITE, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, args->total, &map, &nmap, args->flist); if (!error) { diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c2e49fd18bfd..595cc6311937 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4211,9 +4211,8 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if (flags & XFS_BMAPI_WRITE) - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || mval[i].br_state == XFS_EXT_UNWRITTEN); } @@ -4809,60 +4808,57 @@ xfs_bmapi_convert_unwritten( } /* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * * The returned value in "firstblock" from the first call in a transaction * must be remembered and presented to subsequent calls in "firstblock". * An upper bound for the number of blocks to be allocated is supplied to * the first call in "total"; if no allocation group has that many free * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). */ -int /* error */ -xfs_bmapi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - xfs_bmbt_irec_t *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist) /* i/o: list extents to free */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ { - xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_mount_t *mp; /* xfs mount structure */ - int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - int tmp_logflags; /* temp flags holder */ - int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ - char wr; /* this is a write request */ - char rt; /* this is a realtime file */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + struct xfs_btree_cur *cur; /* bmap btree cursor */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + struct xfs_bmbt_irec got; /* current file extent record */ + xfs_extnum_t lastx; /* last useful extent number */ + int logflags; /* flags for transaction logging */ + xfs_extlen_t minleft; /* min blocks left after allocation */ + int n; /* current extent index */ + int nallocs; /* number of extents alloc'd */ + xfs_fileoff_t obno; /* old block number (offset) */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + int tmp_logflags; /* temp flags holder */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + #ifdef DEBUG - xfs_fileoff_t orig_bno; /* original block number value */ - int orig_flags; /* original flags arg value */ - xfs_filblks_t orig_len; /* original value of len arg */ - xfs_bmbt_irec_t *orig_mval; /* original value of mval */ - int orig_nmap; /* original value of *nmap */ + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ orig_bno = bno; orig_len = len; @@ -4870,69 +4866,60 @@ xfs_bmapi( orig_mval = mval; orig_nmap = *nmap; #endif + ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
XFS_ATTR_FORK : XFS_DATA_FORK; - mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) - XFS_STATS_INC(xs_blk_mapw); - else - XFS_STATS_INC(xs_blk_mapr); - /* - * IGSTATE flag is used to combine extents which - * differ only due to the state of the extents. - * This technique is used from xfs_getbmap() - * when the caller does not wish to see the - * separation (which is the default). - * - * This technique is also used when writing a - * buffer which has been partially written, - * (usually by being flushed during a chunkread), - * to ensure one write takes place. This also - * prevents a change in the xfs inode extents at - * this time, intentionally. This change occurs - * on completion of the write operation, in - * xfs_strat_comp(), where the xfs_bmapi() call - * is transactioned, and the extents combined. - */ - if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ - wr = 0; /* no allocations are allowed */ + + XFS_STATS_INC(xs_blk_mapw); + logflags = 0; nallocs = 0; cur = NULL; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - ASSERT(wr && tp); - if ((error = xfs_bmap_local_to_extents(tp, ip, - firstblock, total, &logflags, whichfork))) + error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, + &logflags, whichfork); + if (error) goto error0; } - if (wr && *firstblock == NULLFSBLOCK) { + + if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else minleft = 1; - } else + } else { minleft = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - goto error0; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); n = 0; end = bno + len; obno = bno; @@ -4945,19 +4932,14 @@ xfs_bmapi( bma.userdata = 0; while (bno < end && n < *nmap) { - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof && !wr) - got.br_startoff = end; inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && isnullstartblock(got.br_startblock); + wasdelay = !inhole && isnullstartblock(got.br_startblock); + /* * First, deal with the hole before the allocated space * that we found, if any. */ - if (wr && (inhole || wasdelay)) { + if (inhole || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4975,36 +4957,20 @@ xfs_bmapi( minleft = 0; if (bma.rval == NULLFSBLOCK) break; - } else if (inhole) { - /* - * Reading in a hole. 
- */ - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = - XFS_FILBLKS_MIN(len, got.br_startoff - bno); - mval->br_state = XFS_EXT_NORM; - bno += mval->br_blockcount; - len -= mval->br_blockcount; - mval++; - n++; - continue; } /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); /* Execute unwritten extent conversion if necessary */ - if (wr) { - error = xfs_bmapi_convert_unwritten(&bma, mval, len, - &lastx, &cur, firstblock, flist, flags, - &tmp_logflags); - logflags |= tmp_logflags; - if (error == EAGAIN) - continue; - if (error) - goto error0; - } + error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, + &cur, firstblock, flist, + flags, &tmp_logflags); + logflags |= tmp_logflags; + if (error == EAGAIN) + continue; + if (error) + goto error0; /* update the extent map to return */ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); @@ -5016,24 +4982,22 @@ xfs_bmapi( */ if (bno >= end || n >= *nmap || nallocs >= *nmap) break; - /* - * Else go on to the next record. - */ + + /* Else go on to the next record. */ prev = got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else eof = 1; - } } *nmap = n; + /* * Transform from btree to extents, give it cur. */ - if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(wr && cur); + ASSERT(cur); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); logflags |= tmp_logflags; @@ -5061,10 +5025,9 @@ error0: * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. */ - if (logflags) { - ASSERT(tp && wr); + if (logflags) xfs_trans_log_inode(tp, ip, logflags); - } + if (cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c70c19c7f1fa..be235f56d2a6 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,25 +62,23 @@ typedef struct xfs_bmap_free #define XFS_BMAP_MAX_NMAP 4 /* - * Flags for xfs_bmapi + * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. 
*/ -#define XFS_BMAPI_CONVERT 0x200 +#define XFS_BMAPI_CONVERT 0x040 #define XFS_BMAPI_FLAGS \ - { XFS_BMAPI_WRITE, "WRITE" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -265,39 +263,17 @@ xfs_bmap_read_extents( struct xfs_inode *ip, /* incore inode */ int whichfork); /* data or attr fork */ -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist); /* i/o: list extents to free */ - int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); /* * Unmap (remove) blocks from a file. 
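To make the prototype above concrete, a minimal allocating call site follows the firstblock/flist contract spelled out in the comments: the firstblock value returned by the first call in a transaction is handed back unchanged to any later xfs_bmapi_write() calls in that transaction, and freed extents accumulate in the shared flist. A sketch with hypothetical variables follows; the xfs_bmap_finish() teardown is the usual pattern in this era of the code and is assumed here, not shown in these hunks.

	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	int			committed;
	int			error;

	/* Seed the firstblock/flist pair once per transaction. */
	xfs_bmap_init(&free_list, &firstfsb);
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, 0,
				&firstfsb, resblks, &imap, &nimaps,
				&free_list);
	if (!error)	/* commit the accumulated extent-free list */
		error = xfs_bmap_finish(&tp, &free_list, &committed);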
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 70a5f580e5ed..46c8aa2740da 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int( */ nmap = 1; ASSERT(args->firstblock != NULL); - error = xfs_bmapi(tp, dp, *bno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| - XFS_BMAPI_CONTIG, + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist); if (error) @@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int( for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); - error = xfs_bmapi(tp, dp, b, c, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| - XFS_BMAPI_METADATA, + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist); if (error) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c377961657ee..179673531f20 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -379,14 +379,12 @@ xfs_qm_dqalloc( xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; - if ((error = xfs_bmapi(tp, quotip, - offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, - &firstblock, - XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist))) { + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) goto error0; - } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 681ba34c9233..da5bf05c5bb7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -210,20 +210,18 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip); - bmapi_flag = XFS_BMAPI_WRITE; + bmapi_flag = 0; if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks. - * * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -582,14 +580,12 @@ xfs_iomap_write_allocate( } /* - * Go get the actual blocks. - * * From this point onwards we overwrite the imap * pointer that the caller gave to us. 
*/ - error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, &first_block, 1, + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -703,8 +699,8 @@ xfs_iomap_write_unwritten( */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e5f40c6460b2..f29424964521 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -120,9 +120,9 @@ xfs_growfs_rt_alloc( */ nmap = 1; cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 63874a87b378..f47ecee8d437 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1633,10 +1633,9 @@ xfs_symlink( first_fsb = 0; nmaps = SYMLINK_MAPS; - error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, - &first_block, resblks, mval, &nmaps, - &free_list); + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); if (error) goto error2; @@ -1782,7 +1781,6 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -1810,7 +1808,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | alloc_type; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -1883,14 +1880,10 @@ xfs_alloc_file_space( xfs_trans_ijoin(tp, ip); - /* - * Issue the xfs_bmapi() call to allocate the blocks - */ xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, bmapi_flag, - &firstfsb, 0, imapp, &nimaps, - &free_list); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); if (error) { goto error0; } -- cgit v1.2.3-58-ga151 From 27a3f8f2de758205765f277b3428bbf3d15da973 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:53 +0000 Subject: xfs: introduce xfs_bmap_last_extent Add a common helper for finding the last extent in a file. Largely based on a patch from Dave Chinner. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 226 +++++++++++++++++++++++++----------------------------- 1 file changed, 105 insertions(+), 121 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 595cc6311937..d9de5ae1fd5c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -203,19 +203,6 @@ xfs_bmap_search_extents( xfs_bmbt_irec_t *gotp, /* out: extent entry found */ xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. 
When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof); /* return value */ - /* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". @@ -3924,42 +3911,122 @@ xfs_bmap_last_before( return 0; } +STATIC int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 0 in *aeof if the file (fork) is empty as any new write will be at, + * or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_inode *ip, + xfs_fileoff_t off, + int whichfork, + char *aeof) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *aeof = 0; + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + *aeof = off >= rec.br_startoff + rec.br_blockcount || + (off >= rec.br_startoff && isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + /* * Returns the file-relative block number of the first block past eof in * the file. This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. 
*/ -int /* error */ +int xfs_bmap_last_offset( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) { - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (!nextents) { - *last_block = 0; - return 0; - } - ep = xfs_iext_get_ext(ifp, nextents - 1); - *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + + *last_block = rec.br_startoff + rec.br_blockcount; return 0; } @@ -5687,89 +5754,6 @@ xfs_getbmap( return error; } -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof) /* return value */ -{ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t s; /* expanded extent record */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *aeof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - xfs_bmbt_get_all(lastrec, &s); - /* - * Check we are allocating in the last extent (for delayed allocations) - * or past the last extent for non-delayed allocations. - */ - *aeof = (off >= s.br_startoff && - off < s.br_startoff + s.br_blockcount && - isnullstartblock(s.br_startblock)) || - off >= s.br_startoff + s.br_blockcount; - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. 
- */ -int /* error */ -xfs_bmap_eof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t endoff, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - int *eof) /* result value */ -{ - xfs_fsblock_t blockcount; /* extent block count */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff; /* extent starting file offset */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *eof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - startoff = xfs_bmbt_get_startoff(lastrec); - blockcount = xfs_bmbt_get_blockcount(lastrec); - *eof = endoff >= startoff + blockcount; - return 0; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( -- cgit v1.2.3-58-ga151 From a5bd606ba65f24e5990edfc0e7b52702720ee6fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:40:54 +0000 Subject: xfs: remove xfs_bmap_add_extent There is no real need for xfs_bmap_add_extent, as the callers know what kind of extents they need to add. Removing it means duplicating the extents-to-btree conversion logic in three places, but overall it's still much simpler code and quite a bit less code. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 416 +++++++++++++++++++++++------------------------------- 1 file changed, 173 insertions(+), 243 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d9de5ae1fd5c..0c9be0ed606e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -50,17 +50,22 @@ #include "xfs_trace.h" -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); -#endif - kmem_zone_t *xfs_bmap_free_item_zone; /* * Prototypes for internal bmap routines. */ +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents( + struct xfs_btree_cur *cur, + struct xfs_inode *ip, + int whichfork); +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#endif + /* * Called from xfs_bmap_add_attrfork to handle extents format files. @@ -84,47 +89,6 @@ xfs_bmap_add_attrfork_local( xfs_bmap_free_t *flist, /* blocks to free at commit */ int *flags); /* inode logging flags */ -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation.
- */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. - */ -STATIC int /* error */ -xfs_bmap_add_extent_unwritten_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. @@ -407,147 +371,7 @@ xfs_bmap_add_attrfork_local( } /* - * Update file extent records and the btree after allocating space. - */ -STATIC int /* error */ -xfs_bmap_add_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_btree_cur_t *cur; /* btree cursor or null */ - xfs_filblks_t da_new; /* new count del alloc blocks used */ - xfs_filblks_t da_old; /* old count del alloc blocks used */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork ptr */ - int logflags; /* returned value */ - xfs_extnum_t nextents; /* number of extents in file now */ - - XFS_STATS_INC(xs_add_exlist); - - cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - da_old = da_new = 0; - error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - ASSERT(!isnullstartblock(new->br_startblock)); - - /* - * Real allocation off the end of the file. - */ - if (*idx == nextents) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); - } else { - xfs_bmbt_irec_t prev; /* old extent at offset idx */ - - /* - * Get the record referred to by idx. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); - /* - * If it's a real allocation record, and the new allocation ends - * after the start of the referred to record, then we're filling - * in a delayed or unwritten allocation with a real one, or - * converting real back to unwritten. 
- */ - if (!isnullstartblock(new->br_startblock) && - new->br_startoff + new->br_blockcount > prev.br_startoff) { - if (prev.br_state != XFS_EXT_UNWRITTEN && - isnullstartblock(prev.br_startblock)) { - da_old = startblockval(prev.br_startblock); - if (cur) - ASSERT(cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(tp, ip, - idx, &cur, new, &da_new, - first, flist, &logflags); - } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) - goto done; - } - } - /* - * Otherwise we're filling in a hole with an allocation. - */ - else { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); - } - } - - if (error) - goto done; - ASSERT(*curp == cur || *curp == NULL); - - /* - * Convert to a btree if necessary. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { - int tmp_logflags; /* partial log flag return val */ - - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, da_old > 0, &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto done; - } - /* - * Adjust for changes in reserved delayed indirect blocks. - * Nothing to do for disk quotas here. - */ - if (da_old || da_new) { - xfs_filblks_t nblks; - - nblks = da_new; - if (cur) - nblks += cur->bc_private.b.allocated; - ASSERT(nblks <= da_old); - if (nblks < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); - } - /* - * Clear out the allocated field, done with it now in any case. - */ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } -done: -#ifdef DEBUG - if (!error) - xfs_bmap_check_leaf_extents(*curp, ip, whichfork); -#endif - *logflagsp = logflags; - return error; -} - -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Convert a delayed allocation to a real allocation. 
*/ STATIC int /* error */ xfs_bmap_add_extent_delay_real( @@ -556,7 +380,6 @@ xfs_bmap_add_extent_delay_real( xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ @@ -572,10 +395,24 @@ xfs_bmap_add_extent_delay_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* value for dnew calculations */ - xfs_filblks_t temp2=0;/* value for dnew calculations */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cur = *curp; + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!cur || (cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + + *logflagsp = 0; + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -583,14 +420,15 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + da_old = startblockval(PREV.br_startblock); + da_new = 0; + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. @@ -688,7 +526,6 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, LEFT.br_state))) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -719,7 +556,6 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount, LEFT.br_state))) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -749,8 +585,6 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, PREV.br_state))) goto done; } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -778,8 +612,6 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -813,13 +645,12 @@ xfs_bmap_add_extent_delay_real( LEFT.br_state))) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); --*idx; - *dnew = temp; break; case BMAP_LEFT_FILLING: @@ -856,14 +687,12 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? 
cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, *idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - - *dnew = temp; break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -896,14 +725,13 @@ xfs_bmap_add_extent_delay_real( goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); ++*idx; - *dnew = temp; break; case BMAP_RIGHT_FILLING: @@ -939,15 +767,14 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); ++*idx; - *dnew = temp; break; case 0: @@ -1029,7 +856,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); ++*idx; - *dnew = temp + temp2; + da_new = temp + temp2; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1044,9 +871,39 @@ xfs_bmap_add_extent_delay_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (cur) + temp += cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + xfs_bmap_check_leaf_extents(cur, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1054,15 +911,17 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Convert an unwritten allocation to a real allocation or vice versa. 
*/ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -1078,15 +937,25 @@ xfs_bmap_add_extent_unwritten_real( int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] + /* * Set up a bunch of variables to make the tests simpler. */ error = 0; - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; @@ -1537,9 +1406,29 @@ xfs_bmap_add_extent_unwritten_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1691,30 +1580,42 @@ xfs_bmap_add_extent_hole_delay( } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Convert a hole to a real allocation. 
*/ STATIC int /* error */ xfs_bmap_add_extent_hole_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_btree_cur_t **curp, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { int error; /* error return value */ int i; /* temp state */ + xfs_btree_cur_t *cur; /* if null, not a btree */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ + *logflagsp = 0; + ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - state = 0; + cur = *curp; + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; @@ -1897,8 +1798,28 @@ xfs_bmap_add_extent_hole_real( } break; } + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, + flist, &cur, 0, &tmp_logflags, whichfork); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: - *logflagsp = rval; + *logflagsp |= rval; return error; } @@ -4792,14 +4713,22 @@ xfs_bmapi_allocate( xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->gotp->br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, bma->gotp, - firstblock, flist, logflags, whichfork); + if (bma->wasdel) { + error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, + cur, bma->gotp, firstblock, flist, logflags); + } else { + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, + cur, bma->gotp, firstblock, flist, logflags, + whichfork); + } + if (error) return error; /* - * Update our extent pointer, given that xfs_bmap_add_extent might - * have merged it into one of the neighbouring ones. + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); @@ -4854,14 +4783,15 @@ xfs_bmapi_convert_unwritten( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(bma->tp, bma->ip, lastx, cur, mval, - firstblock, flist, logflags, whichfork); + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, + cur, mval, firstblock, flist, logflags); if (error) return error; /* - * Update our extent pointer, given that xfs_bmap_add_extent might - * have merged it into one of the neighbouring ones. 
+ * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); @@ -5287,9 +5217,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); if (error) goto error0; goto nodelete; @@ -5345,18 +5275,18 @@ xfs_bunmapi( } prev.br_state = XFS_EXT_UNWRITTEN; lastx--; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &prev, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &del, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; -- cgit v1.2.3-58-ga151 From 1b16447ba24ae39c7fe7133fcdcb4f174dec1901 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:55 +0000 Subject: xfs: pass bmalloca structure to xfs_bmap_isaeof All the variables xfs_bmap_isaeof() is passed are contained within the xfs_bmalloca structure. Pass that instead. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0c9be0ed606e..e2eb3ba5b420 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3867,22 +3867,21 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in *aeof if the file (fork) is empty as any new write will be at, - * or past the EOF. + * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. */ STATIC int xfs_bmap_isaeof( - struct xfs_inode *ip, - xfs_fileoff_t off, - int whichfork, - char *aeof) + struct xfs_bmalloca *bma, + int whichfork) { struct xfs_bmbt_irec rec; int is_empty; int error; - *aeof = 0; - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); if (error || is_empty) return error; @@ -3890,8 +3889,9 @@ xfs_bmap_isaeof( * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. 
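Aside: the xfs_bmap_isaeof() change here is the recurring move of this series. A function that took several exploded arguments now takes the allocation context and reads and writes fields on it. A small standalone sketch of the same move, with invented names rather than the XFS API:

struct walloc_example {
        long    offset;         /* in: file offset being allocated */
        int     aeof;           /* out: allocating at or past EOF? */
};

/* Before: int is_aeof(struct inode_example *, long off, int fork, int *aeof); */
static int
is_aeof_example(struct walloc_example *w, long last_end)
{
        w->aeof = (w->offset >= last_end);
        return 0;
}

The out-parameter becomes a context field, so later stages such as the stripe-alignment check nearby can consult it with no extra plumbing.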
*/ - *aeof = off >= rec.br_startoff + rec.br_blockcount || - (off >= rec.br_startoff && isnullstartblock(rec.br_startblock)); + bma->aeof = bma->off >= rec.br_startoff + rec.br_blockcount || + (bma->off >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); return 0; } @@ -4658,7 +4658,7 @@ xfs_bmapi_allocate( */ if (mp->m_dalign && alen >= mp->m_dalign && !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { - error = xfs_bmap_isaeof(bma->ip, aoff, whichfork, &bma->aeof); + error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } -- cgit v1.2.3-58-ga151 From baf41a52b9c62f9a825371806129ed12e2c1e2d8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:56 +0000 Subject: xfs: move extent records into bmalloca structure Rather than putting extent records on the stack and then pointing to them in the bmalloca structure which is in the same stack frame, put the extent records directly in the bmalloca structure. This reduces the number of args that need to be passed around. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 93 +++++++++++++++++++++++++++---------------------------- fs/xfs/xfs_bmap.h | 4 +-- 2 files changed, 47 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e2eb3ba5b420..68edf99bfad6 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2013,18 +2013,18 @@ xfs_bmap_adjacent( * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ - if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, - ap->prevp->br_startblock)) { - ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->rval = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ adjust = ap->off - - (ap->prevp->br_startoff + ap->prevp->br_blockcount); + (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) + ISVALID(ap->rval + adjust, ap->prev.br_startblock)) ap->rval += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2042,17 +2042,17 @@ xfs_bmap_adjacent( * If there's a previous (left) block, select a requested * start block based on it. */ - if (ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - (prevbno = ap->prevp->br_startblock + - ap->prevp->br_blockcount) && - ISVALID(prevbno, ap->prevp->br_startblock)) { + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ adjust = prevdiff = ap->off - - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); + (ap->prev.br_startoff + + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size.
@@ -2063,7 +2063,7 @@ xfs_bmap_adjacent( */ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && ISVALID(prevbno + prevdiff, - ap->prevp->br_startblock)) + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -2084,16 +2084,16 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!isnullstartblock(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->gotp->br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->off; /* * Figure the startblock based on the next block's * start and the gap size. */ - gotbno = ap->gotp->br_startblock; + gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're @@ -2151,7 +2151,7 @@ xfs_bmap_rtalloc( mp = ap->ip->i_mount; align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, ap->conv, &ap->off, &ap->alen); if (error) @@ -2374,7 +2374,7 @@ xfs_bmap_btalloc( mp = ap->ip->i_mount; align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->off, &ap->alen); ASSERT(!error); @@ -4619,17 +4619,17 @@ xfs_bmapi_allocate( * for in this bmap call but that wouldn't be as good. */ if (bma->wasdel) { - alen = (xfs_extlen_t)bma->gotp->br_blockcount; - aoff = bma->gotp->br_startoff; + alen = (xfs_extlen_t)bma->got.br_blockcount; + aoff = bma->got.br_startoff; if (*lastx != NULLEXTNUM && *lastx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), - bma->prevp); + &bma->prev); } } else { alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->alen, MAXEXTLEN); if (!bma->eof) alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, - bma->gotp->br_startoff - bma->off); + bma->got.br_startoff - bma->off); aoff = bma->off; } @@ -4700,10 +4700,10 @@ xfs_bmapi_allocate( (*cur)->bc_private.b.flags = bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - bma->gotp->br_startoff = aoff; - bma->gotp->br_startblock = abno; - bma->gotp->br_blockcount = alen; - bma->gotp->br_state = XFS_EXT_NORM; + bma->got.br_startoff = aoff; + bma->got.br_startblock = abno; + bma->got.br_blockcount = alen; + bma->got.br_state = XFS_EXT_NORM; /* * A wasdelay extent has been initialized, so shouldn't be flagged @@ -4711,14 +4711,14 @@ xfs_bmapi_allocate( */ if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) - bma->gotp->br_state = XFS_EXT_UNWRITTEN; + bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - cur, bma->gotp, firstblock, flist, logflags); + cur, &bma->got, firstblock, flist, logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - cur, bma->gotp, firstblock, flist, logflags, + cur, &bma->got, firstblock, flist, logflags, whichfork); } @@ -4730,13 +4730,12 @@ xfs_bmapi_allocate( * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. 
*/ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); - ASSERT(bma->gotp->br_startoff <= aoff); - ASSERT(bma->gotp->br_startoff + bma->gotp->br_blockcount >= - aoff + alen); - ASSERT(bma->gotp->br_state == XFS_EXT_NORM || - bma->gotp->br_state == XFS_EXT_UNWRITTEN); + ASSERT(bma->got.br_startoff <= aoff); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= aoff + alen); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); return 0; } @@ -4793,7 +4792,7 @@ xfs_bmapi_convert_unwritten( * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), bma->gotp); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); /* * We may have combined previously unwritten space with written space, @@ -4837,14 +4836,12 @@ xfs_bmapi_write( xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ - struct xfs_bmbt_irec got; /* current file extent record */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks left after allocation */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ xfs_fileoff_t obno; /* old block number (offset) */ - struct xfs_bmbt_irec prev; /* previous file extent record */ int tmp_logflags; /* temp flags holder */ int whichfork; /* data or attr fork */ char inhole; /* current location is hole in file */ @@ -4916,21 +4913,20 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &bma.got, + &bma.prev); n = 0; end = bno + len; obno = bno; bma.tp = tp; bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; bma.total = total; bma.userdata = 0; while (bno < end && n < *nmap) { - inhole = eof || got.br_startoff > bno; - wasdelay = !inhole && isnullstartblock(got.br_startblock); + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); /* * First, deal with the hole before the allocated space @@ -4957,7 +4953,8 @@ xfs_bmapi_write( } /* Deal with the allocated space we found. */ - xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, @@ -4981,9 +4978,9 @@ xfs_bmapi_write( break; /* Else go on to the next record. 
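Aside: the loop below this point used to copy between stack locals that the context merely pointed at; with the records embedded, "bma.prev = bma.got" is a plain struct assignment and nothing in the context can dangle once the caller's frame unwinds. A standalone sketch of embedding by value, with invented names:

struct rec_example {
        long    start;
        long    count;
};

struct walk_ctx_example {
        /* embedded by value: no pointers into a caller's stack frame */
        struct rec_example      prev;   /* extent before the new one */
        struct rec_example      got;    /* extent after, or delayed */
};

static void
advance_example(struct walk_ctx_example *c, struct rec_example next)
{
        c->prev = c->got;       /* struct assignment copies the record */
        c->got = next;
}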
*/ - prev = got; + bma.prev = bma.got; if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &bma.got); else eof = 1; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index be235f56d2a6..6e8f8aee7cdb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -114,8 +114,8 @@ typedef struct xfs_bmalloca { xfs_fileoff_t off; /* offset in file filling in */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec *prevp; /* extent before the new one */ - struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ xfs_extlen_t alen; /* i/o length asked/allocated */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ -- cgit v1.2.3-58-ga151 From 0937e0fd8be6f9c26844127d39d677bb752e8741 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:57 +0000 Subject: xfs: move firstblock and bmap freelist cursor into bmalloca structure Rather than passing the firstblock and freelist structure around, embed it into the bmalloca structure and remove it from the function parameters. This also enables the minleft parameter to be set only once in xfs_bmapi_write(), and the freelist cursor directly queried in xfs_bmapi_allocate to clear it when the lowspace algorithm is activated. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 92 ++++++++++++++++++++++++------------------------- fs/xfs/xfs_bmap.h | 4 +-- fs/xfs/xfs_filestream.c | 2 +- 3 files changed, 48 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 68edf99bfad6..608a0013791b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2006,9 +2006,9 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->firstblock == NULLFSBLOCK; + nullfb = *ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. @@ -2380,8 +2380,8 @@ xfs_bmap_btalloc( ASSERT(!error); ASSERT(ap->alen); } - nullfb = ap->firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); @@ -2391,7 +2391,7 @@ xfs_bmap_btalloc( ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = ap->firstblock; + ap->rval = *ap->firstblock; xfs_bmap_adjacent(ap); @@ -2402,7 +2402,7 @@ xfs_bmap_btalloc( if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) ; else - ap->rval = ap->firstblock; + ap->rval = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ @@ -2413,13 +2413,13 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. 
*/ args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); - args.firstblock = ap->firstblock; + args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->low) { + } else if (ap->flist->xbf_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -2452,7 +2452,7 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->low && ap->aeof) { + if (!ap->flist->xbf_low && ap->aeof) { if (!ap->off) { args.alignment = mp->m_dalign; atype = args.type; @@ -2540,12 +2540,25 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->low = 1; + ap->flist->xbf_low = 1; } if (args.fsbno != NULLFSBLOCK) { - ap->firstblock = ap->rval = args.fsbno; + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->rval = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->low && fb_agno < args.agno)); + (ap->flist->xbf_low && fb_agno < args.agno)); ap->alen = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); @@ -4596,8 +4609,6 @@ xfs_bmapi_allocate( struct xfs_bmalloca *bma, xfs_extnum_t *lastx, struct xfs_btree_cur **cur, - xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int flags, int *nallocs, int *logflags) @@ -4647,9 +4658,7 @@ xfs_bmapi_allocate( */ bma->alen = alen; bma->off = aoff; - bma->firstblock = *firstblock; bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - bma->low = flist->xbf_low; bma->aeof = 0; /* @@ -4671,24 +4680,18 @@ xfs_bmapi_allocate( * Copy out result fields. 
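Aside: keeping a pointer to the caller's firstblock in the context lets the allocator both record the first allocation of the transaction and, as in the ASSERT above, check that every later allocation comes from the same or a higher allocation group (the low-space algorithm relaxes this to strictly ascending). A standalone sketch of that invariant with toy AG arithmetic; the names are invented and this is not XFS code:

#include <assert.h>

#define NULLBLK_EXAMPLE         (-1L)
#define AG_OF_EXAMPLE(b)        ((b) / 1000)    /* toy allocation-group number */

struct actx_example {
        long    *firstblock;    /* points at the caller's variable */
        int     low;            /* low-space mode active? */
};

static void
record_alloc_example(struct actx_example *a, long blkno)
{
        assert(*a->firstblock == NULLBLK_EXAMPLE ||
               AG_OF_EXAMPLE(*a->firstblock) == AG_OF_EXAMPLE(blkno) ||
               (a->low &&
                AG_OF_EXAMPLE(*a->firstblock) < AG_OF_EXAMPLE(blkno)));

        if (*a->firstblock == NULLBLK_EXAMPLE)
                *a->firstblock = blkno; /* first allocation: remember it */
}

Because the context stores the pointer rather than a copy, the update is visible to the caller without a copy-back step on every exit path.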
*/ abno = bma->rval; - flist->xbf_low = bma->low; alen = bma->alen; aoff = bma->off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma->firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma->firstblock))); - *firstblock = bma->firstblock; + if (bma->flist->xbf_low) + bma->minleft = 0; if (*cur) - (*cur)->bc_private.b.firstblock = *firstblock; + (*cur)->bc_private.b.firstblock = *bma->firstblock; if (abno == NULLFSBLOCK) return 0; if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *firstblock; - (*cur)->bc_private.b.flist = flist; + (*cur)->bc_private.b.firstblock = *bma->firstblock; + (*cur)->bc_private.b.flist = bma->flist; } /* * Bump the number of extents we've allocated @@ -4715,11 +4718,12 @@ xfs_bmapi_allocate( if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - cur, &bma->got, firstblock, flist, logflags); + cur, &bma->got, bma->firstblock, bma->flist, + logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - cur, &bma->got, firstblock, flist, logflags, - whichfork); + cur, &bma->got, bma->firstblock, bma->flist, + logflags, whichfork); } if (error) @@ -4746,8 +4750,6 @@ xfs_bmapi_convert_unwritten( xfs_filblks_t len, xfs_extnum_t *lastx, struct xfs_btree_cur **cur, - xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int flags, int *logflags) { @@ -4776,14 +4778,14 @@ xfs_bmapi_convert_unwritten( if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { *cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *firstblock; - (*cur)->bc_private.b.flist = flist; + (*cur)->bc_private.b.firstblock = *bma->firstblock; + (*cur)->bc_private.b.flist = bma->flist; } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, - cur, mval, firstblock, flist, logflags); + cur, mval, bma->firstblock, bma->flist, logflags); if (error) return error; @@ -4838,7 +4840,6 @@ xfs_bmapi_write( int error; /* error return */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ int n; /* current extent index */ int nallocs; /* number of extents alloc'd */ xfs_fileoff_t obno; /* old block number (offset) */ @@ -4900,11 +4901,11 @@ xfs_bmapi_write( if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else - minleft = 1; + bma.minleft = 1; } else { - minleft = 0; + bma.minleft = 0; } if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -4923,6 +4924,8 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; @@ -4938,16 +4941,12 @@ xfs_bmapi_write( bma.wasdel = wasdelay; bma.alen = len; bma.off = bno; - bma.minleft = minleft; - error = xfs_bmapi_allocate(&bma, &lastx, &cur, - firstblock, flist, flags, &nallocs, - &tmp_logflags); + error = xfs_bmapi_allocate(&bma, &lastx, &cur, flags, + &nallocs, &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; - if (flist && flist->xbf_low) - minleft = 0; if (bma.rval == NULLFSBLOCK) break; } @@ -4958,8 +4957,7 @@ xfs_bmapi_write( /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, - &cur, firstblock, flist, - flags, &tmp_logflags); + &cur, flags, &tmp_logflags); logflags |= tmp_logflags; if (error == EAGAIN) continue; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 6e8f8aee7cdb..6e7c7a50d248 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -109,7 +109,8 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) * Argument structure for xfs_bmap_alloc. */ typedef struct xfs_bmalloca { - xfs_fsblock_t firstblock; /* i/o first block allocated */ + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ xfs_fsblock_t rval; /* starting block of new extent */ xfs_fileoff_t off; /* offset in file filling in */ struct xfs_trans *tp; /* transaction pointer */ @@ -123,7 +124,6 @@ typedef struct xfs_bmalloca { char eof; /* set if allocating past last extent */ char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ - char low; /* low on space, using seq'l ags */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3ff3d9e23ded..137f957b8c7e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -761,7 +761,7 @@ xfs_filestream_new_ag( */ ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->low ? XFS_PICK_LOWSPACE : 0); + (ap->flist->xbf_low ? 
XFS_PICK_LOWSPACE : 0); err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); if (err || *agp == NULLAGNUMBER) -- cgit v1.2.3-58-ga151 From 3a75667e902dbdb87718b1ee2b3b745b344a8163 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:58 +0000 Subject: xfs: rename allocation range fields in struct xfs_bmalloca Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 138 ++++++++++++++++++++++++------------------------ fs/xfs/xfs_bmap.h | 8 +-- fs/xfs/xfs_filestream.c | 2 +- 3 files changed, 75 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 608a0013791b..b47555cfbd8f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2017,15 +2017,15 @@ xfs_bmap_adjacent( !isnullstartblock(ap->prev.br_startblock) && ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, ap->prev.br_startblock)) { - ap->rval = ap->prev.br_startblock + ap->prev.br_blockcount; + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ - adjust = ap->off - + adjust = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prev.br_startblock)) - ap->rval += adjust; + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2050,7 +2050,7 @@ xfs_bmap_adjacent( /* * Calculate gap to end of previous block. */ - adjust = prevdiff = ap->off - + adjust = prevdiff = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); /* @@ -2061,7 +2061,7 @@ xfs_bmap_adjacent( * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ - if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(prevbno + prevdiff, ap->prev.br_startblock)) prevbno += adjust; @@ -2088,7 +2088,7 @@ xfs_bmap_adjacent( /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->got.br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. @@ -2101,12 +2101,12 @@ xfs_bmap_adjacent( * number, then just use the start of the next block * offset by our length. */ - if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(gotbno - gotdiff, gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->alen, gotbno)) { - gotbno -= ap->alen; - gotdiff += adjust - ap->alen; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; } else gotdiff += adjust; /* @@ -2124,14 +2124,14 @@ xfs_bmap_adjacent( gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good - * one, else ap->rval is already set (to 0 or the inode block). + * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) - ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + ap->blkno = prevdiff <= gotdiff ? 
prevbno : gotbno; else if (prevbno != NULLFSBLOCK) - ap->rval = prevbno; + ap->blkno = prevbno; else if (gotbno != NULLFSBLOCK) - ap->rval = gotbno; + ap->blkno = gotbno; } #undef ISVALID } @@ -2153,22 +2153,22 @@ xfs_bmap_rtalloc( prod = align / mp->m_sb.sb_rextsize; error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, - ap->conv, &ap->off, &ap->alen); + ap->conv, &ap->offset, &ap->length); if (error) return error; - ASSERT(ap->alen); - ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, align) || ap->alen % align) + if (do_mod(ap->offset, align) || ap->length % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. */ - ralen = ap->alen / mp->m_sb.sb_rextsize; + ralen = ap->length / mp->m_sb.sb_rextsize; /* * If the old value was close enough to MAXEXTLEN that * we rounded up to it, cut it back so it's valid again. @@ -2189,15 +2189,15 @@ xfs_bmap_rtalloc( * If it's an allocation to an empty file at offset 0, * pick an extent that will space things out in the rt area. */ - if (ap->eof && ap->off == 0) { + if (ap->eof && ap->offset == 0) { xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->rval = rtx * mp->m_sb.sb_rextsize; + ap->blkno = rtx * mp->m_sb.sb_rextsize; } else { - ap->rval = 0; + ap->blkno = 0; } xfs_bmap_adjacent(ap); @@ -2205,23 +2205,23 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->rval, mp->m_sb.sb_rextsize); - rtb = ap->rval; - ap->alen = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, &ralen, atype, ap->wasdel, prod, &rtb))) return error; if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, - ap->alen, &ralen, atype, + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, ap->wasdel, 1, &rtb))) return error; - ap->rval = rtb; - if (ap->rval != NULLFSBLOCK) { - ap->rval *= mp->m_sb.sb_rextsize; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; ralen *= mp->m_sb.sb_rextsize; - ap->alen = ralen; + ap->length = ralen; ap->ip->i_d.di_nblocks += ralen; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2234,7 +2234,7 @@ xfs_bmap_rtalloc( ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); } else { - ap->alen = 0; + ap->length = 0; } return 0; } @@ -2349,7 +2349,7 @@ xfs_bmap_btalloc_nullfb( * AG as the stream may have moved. */ if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -2376,9 +2376,9 @@ xfs_bmap_btalloc( if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, - &ap->off, &ap->alen); + &ap->offset, &ap->length); ASSERT(!error); - ASSERT(ap->alen); + ASSERT(ap->length); } nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? 
NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); @@ -2386,33 +2386,33 @@ xfs_bmap_btalloc( if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); } else { - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = *ap->firstblock; + ap->blkno = *ap->firstblock; xfs_bmap_adjacent(ap); /* - * If allowed, use ap->rval; otherwise must use firstblock since + * If allowed, use ap->blkno; otherwise must use firstblock since * it's in the right allocation group. */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) ; else - ap->rval = *ap->firstblock; + ap->blkno = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; args.tp = ap->tp; args.mp = mp; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { @@ -2433,14 +2433,14 @@ xfs_bmap_btalloc( /* apply extent size hints if obtained earlier */ if (unlikely(align)) { args.prod = align; - if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } /* @@ -2453,7 +2453,7 @@ xfs_bmap_btalloc( * at the end of file. */ if (!ap->flist->xbf_low && ap->aeof) { - if (!ap->off) { + if (!ap->offset) { args.alignment = mp->m_dalign; atype = args.type; isaligned = 1; @@ -2506,7 +2506,7 @@ xfs_bmap_btalloc( * turned on. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = mp->m_dalign; args.minlen = nextminlen; args.minalignslop = 0; @@ -2520,7 +2520,7 @@ xfs_bmap_btalloc( * try again. 
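Aside: the retry ladder in this region weakens one constraint per attempt, first giving up stripe alignment, then shrinking the minimum length, before the caller finally falls back to low-space mode. A standalone sketch of that cascade; the names are invented, and the toy allocator simply refuses while the constraints are still "too strict":

struct aargs_example {
        int     alignment;
        int     minlen;
};

/* Toy stand-in for xfs_alloc_vextent(); *out == -1 means "no space". */
static int
vextent_example(struct aargs_example *a, long *out)
{
        *out = (a->alignment > 1 || a->minlen > 2) ? -1 : 42;
        return 0;
}

static int
btalloc_example(int want_minlen, long *out)
{
        struct aargs_example args = { 8, want_minlen };
        int error;

        if ((error = vextent_example(&args, out)))
                return error;
        if (*out == -1) {
                args.alignment = 1;     /* retry unaligned */
                if ((error = vextent_example(&args, out)))
                        return error;
        }
        if (*out == -1 && args.minlen > 1) {
                args.minlen = 1;        /* retry with the bare minimum */
                if ((error = vextent_example(&args, out)))
                        return error;
        }
        return 0;
}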
*/ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) return error; @@ -2529,7 +2529,7 @@ xfs_bmap_btalloc( args.minlen > ap->minlen) { args.minlen = ap->minlen; args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2554,12 +2554,12 @@ xfs_bmap_btalloc( XFS_FSB_TO_AGNO(mp, *ap->firstblock) < XFS_FSB_TO_AGNO(mp, args.fsbno))); - ap->rval = args.fsbno; + ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || (ap->flist->xbf_low && fb_agno < args.agno)); - ap->alen = args.len; + ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2573,8 +2573,8 @@ xfs_bmap_btalloc( XFS_TRANS_DQ_BCOUNT, (long) args.len); } else { - ap->rval = NULLFSBLOCK; - ap->alen = 0; + ap->blkno = NULLFSBLOCK; + ap->length = 0; } return 0; } @@ -3902,8 +3902,8 @@ xfs_bmap_isaeof( * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. */ - bma->aeof = bma->off >= rec.br_startoff + rec.br_blockcount || - (bma->off >= rec.br_startoff && + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && isnullstartblock(rec.br_startblock)); return 0; } @@ -4637,11 +4637,11 @@ xfs_bmapi_allocate( &bma->prev); } } else { - alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->alen, MAXEXTLEN); + alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, - bma->got.br_startoff - bma->off); - aoff = bma->off; + bma->got.br_startoff - bma->offset); + aoff = bma->offset; } /* @@ -4656,8 +4656,8 @@ xfs_bmapi_allocate( /* * Fill in changeable bma fields. */ - bma->alen = alen; - bma->off = aoff; + bma->length = alen; + bma->offset = aoff; bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; bma->aeof = 0; @@ -4679,9 +4679,9 @@ xfs_bmapi_allocate( /* * Copy out result fields. 
*/ - abno = bma->rval; - alen = bma->alen; - aoff = bma->off; + abno = bma->blkno; + alen = bma->length; + aoff = bma->offset; if (bma->flist->xbf_low) bma->minleft = 0; if (*cur) @@ -4939,15 +4939,15 @@ xfs_bmapi_write( bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; - bma.alen = len; - bma.off = bno; + bma.length = len; + bma.offset = bno; error = xfs_bmapi_allocate(&bma, &lastx, &cur, flags, &nallocs, &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; - if (bma.rval == NULLFSBLOCK) + if (bma.blkno == NULLFSBLOCK) break; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 6e7c7a50d248..5f398b1ac70c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -111,13 +111,15 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) typedef struct xfs_bmalloca { xfs_fsblock_t *firstblock; /* i/o first block allocated */ struct xfs_bmap_free *flist; /* bmap freelist */ - xfs_fsblock_t rval; /* starting block of new extent */ - xfs_fileoff_t off; /* offset in file filling in */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ struct xfs_bmbt_irec got; /* extent after, or delayed */ - xfs_extlen_t alen; /* i/o length asked/allocated */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 137f957b8c7e..5170306a1009 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -682,7 +682,7 @@ xfs_filestream_new_ag( ip = ap->ip; mp = ip->i_mount; cache = mp->m_filestream; - minlen = ap->alen; + minlen = ap->length; *agp = NULLAGNUMBER; /* -- cgit v1.2.3-58-ga151 From 963c30cf45e8c832ae11438ff9d99c954b9d0114 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:40:59 +0000 Subject: xfs: do not keep local copies of allocation ranges in xfs_bmapi_allocate Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index b47555cfbd8f..6a7832d3540e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4617,9 +4617,6 @@ xfs_bmapi_allocate( int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); - xfs_fsblock_t abno; - xfs_extlen_t alen; - xfs_fileoff_t aoff; int error; int rt; @@ -4630,18 +4627,17 @@ xfs_bmapi_allocate( * for in this bmap call but that wouldn't be as good. 
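Aside: this commit's deletions remove a copy-in/copy-out dance. Instead of mirroring bma->length and bma->offset into alen/aoff and writing them back at the end, the helper now clamps the context fields in place. A standalone sketch of the in-place form, under invented names:

#define MAXLEN_EXAMPLE  128     /* toy stand-in for MAXEXTLEN */

struct bctx_example {
        long    offset;         /* offset in file filling in */
        long    length;         /* i/o length asked/allocated */
        int     eof;            /* allocating past the last extent? */
};

static void
clamp_request_example(struct bctx_example *b, long next_start)
{
        if (b->length > MAXLEN_EXAMPLE)
                b->length = MAXLEN_EXAMPLE;
        if (!b->eof && next_start - b->offset < b->length)
                b->length = next_start - b->offset;     /* stop at next extent */
}

With one authoritative copy there is no window in which the local and the context disagree, which is what made the old "copy out result fields" blocks necessary.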
*/ if (bma->wasdel) { - alen = (xfs_extlen_t)bma->got.br_blockcount; - aoff = bma->got.br_startoff; + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; if (*lastx != NULLEXTNUM && *lastx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), &bma->prev); } } else { - alen = (xfs_extlen_t)XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) - alen = (xfs_extlen_t)XFS_FILBLKS_MIN(alen, + bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); - aoff = bma->offset; } /* @@ -4649,23 +4645,17 @@ xfs_bmapi_allocate( * user data. */ if (!(flags & XFS_BMAPI_METADATA)) { - bma->userdata = (aoff == 0) ? + bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - /* - * Fill in changeable bma fields. - */ - bma->length = alen; - bma->offset = aoff; - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - bma->aeof = 0; + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ - if (mp->m_dalign && alen >= mp->m_dalign && + if (mp->m_dalign && bma->length >= mp->m_dalign && !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) @@ -4676,17 +4666,11 @@ xfs_bmapi_allocate( if (error) return error; - /* - * Copy out result fields. - */ - abno = bma->blkno; - alen = bma->length; - aoff = bma->offset; if (bma->flist->xbf_low) bma->minleft = 0; if (*cur) (*cur)->bc_private.b.firstblock = *bma->firstblock; - if (abno == NULLFSBLOCK) + if (bma->blkno == NULLFSBLOCK) return 0; if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); @@ -4703,9 +4687,9 @@ xfs_bmapi_allocate( (*cur)->bc_private.b.flags = bma->wasdel ? 
XFS_BTCUR_BPRV_WASDEL : 0; - bma->got.br_startoff = aoff; - bma->got.br_startblock = abno; - bma->got.br_blockcount = alen; + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; /* @@ -4736,8 +4720,9 @@ xfs_bmapi_allocate( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); - ASSERT(bma->got.br_startoff <= aoff); - ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= aoff + alen); + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); ASSERT(bma->got.br_state == XFS_EXT_NORM || bma->got.br_state == XFS_EXT_UNWRITTEN); return 0; -- cgit v1.2.3-58-ga151 From 29c8d17a8938be88e36b93522753f3519aefd05d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:41:00 +0000 Subject: xfs: move btree cursor into bmalloca Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 56 ++++++++++++++++++++++++++----------------------------- fs/xfs/xfs_bmap.h | 2 ++ 2 files changed, 28 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6a7832d3540e..a54326bf9aa0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4608,7 +4608,6 @@ STATIC int xfs_bmapi_allocate( struct xfs_bmalloca *bma, xfs_extnum_t *lastx, - struct xfs_btree_cur **cur, int flags, int *nallocs, int *logflags) @@ -4668,14 +4667,14 @@ xfs_bmapi_allocate( if (bma->flist->xbf_low) bma->minleft = 0; - if (*cur) - (*cur)->bc_private.b.firstblock = *bma->firstblock; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; if (bma->blkno == NULLFSBLOCK) return 0; - if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { - (*cur) = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *bma->firstblock; - (*cur)->bc_private.b.flist = bma->flist; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; } /* * Bump the number of extents we've allocated @@ -4683,8 +4682,8 @@ xfs_bmapi_allocate( */ (*nallocs)++; - if (*cur) - (*cur)->bc_private.b.flags = + if (bma->cur) + bma->cur->bc_private.b.flags = bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; bma->got.br_startoff = bma->offset; @@ -4702,12 +4701,12 @@ xfs_bmapi_allocate( if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - cur, &bma->got, bma->firstblock, bma->flist, - logflags); + &bma->cur, &bma->got, bma->firstblock, + bma->flist, logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - cur, &bma->got, bma->firstblock, bma->flist, - logflags, whichfork); + &bma->cur, &bma->got, bma->firstblock, + bma->flist, logflags, whichfork); } if (error) @@ -4734,7 +4733,6 @@ xfs_bmapi_convert_unwritten( struct xfs_bmbt_irec *mval, xfs_filblks_t len, xfs_extnum_t *lastx, - struct xfs_btree_cur **cur, int flags, int *logflags) { @@ -4760,17 +4758,17 @@ xfs_bmapi_convert_unwritten( * Modify (by adding) the state flag, if writing. 
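Aside: both call sites now follow the same shape, creating the btree cursor the first time it is needed, parking it in the context, and letting a single teardown path at the end of the operation release it. A standalone sketch of context-owned lazy setup, with invented names:

#include <stdlib.h>

struct cursor_example {
        int     dummy;
};

struct wctx_example {
        struct cursor_example   *cur;   /* created on first use */
};

static struct cursor_example *
get_cursor_example(struct wctx_example *w)
{
        if (!w->cur)
                w->cur = calloc(1, sizeof(*w->cur));
        return w->cur;          /* NULL only on allocation failure */
}

static void
teardown_example(struct wctx_example *w)
{
        free(w->cur);           /* one place releases it, success or error */
        w->cur = NULL;
}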
*/ ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !*cur) { - *cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); - (*cur)->bc_private.b.firstblock = *bma->firstblock; - (*cur)->bc_private.b.flist = bma->flist; + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, - cur, mval, bma->firstblock, bma->flist, logflags); + &bma->cur, mval, bma->firstblock, bma->flist, logflags); if (error) return error; @@ -4819,7 +4817,6 @@ xfs_bmapi_write( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ - struct xfs_btree_cur *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ @@ -4875,7 +4872,6 @@ xfs_bmapi_write( logflags = 0; nallocs = 0; - cur = NULL; if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, @@ -4927,7 +4923,7 @@ xfs_bmapi_write( bma.length = len; bma.offset = bno; - error = xfs_bmapi_allocate(&bma, &lastx, &cur, flags, + error = xfs_bmapi_allocate(&bma, &lastx, flags, &nallocs, &tmp_logflags); logflags |= tmp_logflags; if (error) @@ -4942,7 +4938,7 @@ xfs_bmapi_write( /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, - &cur, flags, &tmp_logflags); + flags, &tmp_logflags); logflags |= tmp_logflags; if (error == EAGAIN) continue; @@ -4974,8 +4970,8 @@ xfs_bmapi_write( */ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(cur); - error = xfs_bmap_btree_to_extents(tp, ip, cur, + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -5005,19 +5001,19 @@ error0: if (logflags) xfs_trans_log_inode(tp, ip, logflags); - if (cur) { + if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock) || + bma.cur->bc_private.b.firstblock) || (flist->xbf_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock))); - *firstblock = cur->bc_private.b.firstblock; + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; } - xfs_btree_del_cursor(cur, + xfs_btree_del_cursor(bma.cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } if (!error) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 5f398b1ac70c..858d9d509989 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -120,6 +120,8 @@ typedef struct xfs_bmalloca { xfs_extlen_t length; /* i/o length asked/allocated */ xfs_fsblock_t blkno; /* starting block of new extent */ + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ -- cgit v1.2.3-58-ga151 From e0c3da5d89dc1aeef2275a8b751231e147603f0f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sun, 18 Sep 2011 20:41:01 +0000 Subject: xfs: move lastx and nallocs into bmalloca Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 47 +++++++++++++++++++++-------------------------- fs/xfs/xfs_bmap.h | 2 ++ 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a54326bf9aa0..3ef145307274 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4607,9 +4607,7 @@ xfs_bmapi_delay( STATIC int xfs_bmapi_allocate( struct xfs_bmalloca *bma, - xfs_extnum_t *lastx, int flags, - int *nallocs, int *logflags) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4628,8 +4626,8 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (*lastx != NULLEXTNUM && *lastx) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx - 1), + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } } else { @@ -4680,7 +4678,7 @@ xfs_bmapi_allocate( * Bump the number of extents we've allocated * in this call. */ - (*nallocs)++; + bma->nallocs++; if (bma->cur) bma->cur->bc_private.b.flags = @@ -4700,13 +4698,14 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) { - error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, lastx, - &bma->cur, &bma->got, bma->firstblock, - bma->flist, logflags); + error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, + &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->flist, logflags); } else { - error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, lastx, - &bma->cur, &bma->got, bma->firstblock, - bma->flist, logflags, whichfork); + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, + &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->flist, logflags, + whichfork); } if (error) @@ -4717,7 +4716,7 @@ xfs_bmapi_allocate( * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); ASSERT(bma->got.br_startoff <= bma->offset); ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= @@ -4732,7 +4731,6 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - xfs_extnum_t *lastx, int flags, int *logflags) { @@ -4767,7 +4765,7 @@ xfs_bmapi_convert_unwritten( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, lastx, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, logflags); if (error) return error; @@ -4777,7 +4775,7 @@ xfs_bmapi_convert_unwritten( * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), &bma->got); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); /* * We may have combined previously unwritten space with written space, @@ -4820,10 +4818,8 @@ xfs_bmapi_write( xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ - xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ xfs_fileoff_t obno; /* old block number (offset) */ int tmp_logflags; /* temp flags holder */ int whichfork; /* data or attr fork */ @@ -4871,7 +4867,6 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); logflags = 0; - nallocs = 0; if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, @@ -4895,7 +4890,7 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &bma.got, + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, &bma.prev); n = 0; end = bno + len; @@ -4923,8 +4918,7 @@ xfs_bmapi_write( bma.length = len; bma.offset = bno; - error = xfs_bmapi_allocate(&bma, &lastx, flags, - &nallocs, &tmp_logflags); + error = xfs_bmapi_allocate(&bma, flags, &tmp_logflags); logflags |= tmp_logflags; if (error) goto error0; @@ -4937,7 +4931,7 @@ xfs_bmapi_write( end, n, flags); /* Execute unwritten extent conversion if necessary */ - error = xfs_bmapi_convert_unwritten(&bma, mval, len, &lastx, + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags, &tmp_logflags); logflags |= tmp_logflags; if (error == EAGAIN) @@ -4953,14 +4947,15 @@ xfs_bmapi_write( * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise * the transaction may get too big. */ - if (bno >= end || n >= *nmap || nallocs >= *nmap) + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) break; /* Else go on to the next record. 
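Aside: once the extent index lives in the context, the "++bma.idx" below advances the walk for every later stage at once, and no helper needs an extra xfs_extnum_t out-parameter. A tiny standalone sketch, names invented:

struct iter_example {
        int     idx;            /* current extent index */
        int     nallocs;        /* extents allocated in this call */
        int     nextents;       /* records in the fork */
};

/* Step to the next record; returns 0 once the walk falls off the end. */
static int
next_record_example(struct iter_example *it)
{
        return ++it->idx < it->nextents;
}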
*/ bma.prev = bma.got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &bma.got); - else + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else eof = 1; } *nmap = n; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 858d9d509989..c2d9e1ef4ba3 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -121,6 +121,8 @@ typedef struct xfs_bmalloca { xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ -- cgit v1.2.3-58-ga151 From c315c90b7d530d1ec3c226052e153b0cffa512c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:02 +0000 Subject: xfs: move logflags into bmalloca Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 48 ++++++++++++++++++++++-------------------------- fs/xfs/xfs_bmap.h | 1 + 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3ef145307274..ed094749851b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4607,13 +4607,13 @@ xfs_bmapi_delay( STATIC int xfs_bmapi_allocate( struct xfs_bmalloca *bma, - int flags, - int *logflags) + int flags) { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; int error; int rt; @@ -4700,14 +4700,15 @@ xfs_bmapi_allocate( if (bma->wasdel) { error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, logflags); + bma->firstblock, bma->flist, &tmp_logflags); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, logflags, + bma->firstblock, bma->flist, &tmp_logflags, whichfork); } + bma->logflags |= tmp_logflags; if (error) return error; @@ -4731,16 +4732,14 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags, - int *logflags) + int flags) { int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; int error; - *logflags = 0; - /* check if we need to do unwritten->real conversion */ if (mval->br_state == XFS_EXT_UNWRITTEN && (flags & XFS_BMAPI_PREALLOC)) @@ -4766,7 +4765,9 @@ xfs_bmapi_convert_unwritten( ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->flist, logflags); + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; if (error) return error; @@ -4818,10 +4819,8 @@ xfs_bmapi_write( xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ - int logflags; /* flags for transaction logging */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ - int tmp_logflags; /* temp flags holder */ int whichfork; /* data or attr fork */ char inhole; /* current location is hole in file */ char wasdelay; /* old extent was delayed */ @@ -4866,11 +4865,9 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); - logflags = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, - &logflags, whichfork); + &bma.logflags, whichfork); if (error) goto error0; } @@ -4918,8 +4915,7 @@ xfs_bmapi_write( bma.length = len; bma.offset = bno; - error = xfs_bmapi_allocate(&bma, flags, &tmp_logflags); - logflags |= tmp_logflags; + error = xfs_bmapi_allocate(&bma, flags); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) @@ -4931,9 +4927,7 @@ xfs_bmapi_write( end, n, flags); /* Execute unwritten extent conversion if necessary */ - error = xfs_bmapi_convert_unwritten(&bma, mval, len, - flags, &tmp_logflags); - logflags |= tmp_logflags; + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); if (error == EAGAIN) continue; if (error) @@ -4965,10 +4959,12 @@ xfs_bmapi_write( */ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + int tmp_logflags = 0; + ASSERT(bma.cur); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); - logflags |= tmp_logflags; + bma.logflags |= tmp_logflags; if (error) goto error0; } @@ -4982,19 +4978,19 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & xfs_ilog_fext(whichfork)) && + if ((bma.logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~xfs_ilog_fext(whichfork); - else if ((logflags & xfs_ilog_fbroot(whichfork)) && + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~xfs_ilog_fbroot(whichfork); + bma.logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. 
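Aside: the flag handling above is deliberately failure-tolerant. Each stage reports into a private tmp_logflags, the caller ORs that into the context even when the stage errored, and the accumulated mask is logged at the end regardless. A standalone sketch, with invented names:

#define LOG_EXT_EXAMPLE         0x1
#define LOG_BROOT_EXAMPLE       0x2

struct lctx_example {
        int     logflags;
};

static int
stage_example(int *tmp_logflags)
{
        *tmp_logflags = LOG_EXT_EXAMPLE;        /* dirtied before failing */
        return -1;                              /* simulated failure */
}

static int
run_example(struct lctx_example *c)
{
        int tmp_logflags = 0;
        int error = stage_example(&tmp_logflags);

        c->logflags |= tmp_logflags;    /* keep flags even on error */
        return error;
}

As the comment above says, logging whatever the flags say, even on error, is what catches the case where data changed but the failure path would otherwise skip the log.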
*/ - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); if (bma.cur) { if (!error) { diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c2d9e1ef4ba3..243e212b31f6 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -123,6 +123,7 @@ typedef struct xfs_bmalloca { struct xfs_btree_cur *cur; /* btree cursor */ xfs_extnum_t idx; /* current extent index */ int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ -- cgit v1.2.3-58-ga151 From 572a4cf04ac7f46e9206aabfef03dae602812341 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:04 +0000 Subject: xfs: pass bmalloca to xfs_bmap_add_extent_delay_real All the parameters passed to xfs_bmap_add_extent_delay_real() are in the xfs_bmalloca structure now. Just pass the bmalloca parameter to the function instead of 8 separate parameters. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 332 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 171 insertions(+), 161 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index ed094749851b..c8b9c4ec9f6f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -375,16 +375,9 @@ xfs_bmap_add_attrfork_local( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + struct xfs_bmalloca *bma) { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ @@ -401,18 +394,16 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - cur = *curp; + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); - ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || (cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(xs_add_exlist); - *logflagsp = 0; - #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -420,7 +411,7 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); @@ -442,9 +433,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. 
*/ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -462,9 +453,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -495,35 +486,39 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents--; - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + RIGHT.br_blockcount, LEFT.br_state); + if (error) goto done; } break; @@ -533,27 +528,29 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. 
*/ - --*idx; + bma->idx--; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state))) + PREV.br_blockcount, LEFT.br_state); + if (error) goto done; } break; @@ -563,26 +560,28 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state))) + RIGHT.br_blockcount, PREV.br_state); + if (error) goto done; } break; @@ -593,22 +592,24 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ip->i_d.di_nextents++; - if (cur == NULL) + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -619,38 +620,40 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. 
*/ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - if (cur == NULL) + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state))) + LEFT.br_state); + if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - --*idx; + bma->idx--; break; case BMAP_LEFT_FILLING: @@ -658,41 +661,43 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); + (bma->cur ? 
bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -701,37 +706,39 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state))) + RIGHT.br_state); + if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; + bma->idx++; break; case BMAP_RIGHT_FILLING: @@ -740,41 +747,43 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. 
*/ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; + bma->idx++; break; case 0: @@ -800,46 +809,48 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(ip, temp2)); + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if 
(bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = xfs_bmap_worst_indlen(ip, temp); - temp2 = xfs_bmap_worst_indlen(ip, temp2); + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); if (diff > 0) { - error = xfs_icsb_modify_counters(ip->i_mount, + error = xfs_icsb_modify_counters(bma->ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), 0); ASSERT(!error); @@ -847,15 +858,15 @@ xfs_bmap_add_extent_delay_real( goto done; } - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - ++*idx; + bma->idx++; da_new = temp + temp2; break; @@ -873,14 +884,15 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, da_old > 0, &tmp_logflags, XFS_DATA_FORK); - *logflagsp |= tmp_logflags; + bma->logflags |= tmp_logflags; if (error) goto done; } @@ -888,22 +900,22 @@ xfs_bmap_add_extent_delay_real( /* adjust for changes in reserved delayed indirect blocks */ if (da_old || da_new) { temp = da_new; - if (cur) - temp += cur->bc_private.b.allocated; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; ASSERT(temp <= da_old); if (temp < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - temp), 0); + xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); } /* clear out the allocated field, done with it now in any case. 
*/ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } - xfs_bmap_check_leaf_extents(cur, ip, XFS_DATA_FORK); + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); done: - *logflagsp |= rval; + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -4698,9 +4710,7 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) { - error = xfs_bmap_add_extent_delay_real(bma->tp, bma->ip, - &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, &tmp_logflags); + error = xfs_bmap_add_extent_delay_real(bma); } else { error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, &bma->idx, &bma->cur, &bma->got, -- cgit v1.2.3-58-ga151 From c6534249851d062113ab4d8d226be8dba8ecb92e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:05 +0000 Subject: xfs: pass bmalloca to xfs_bmap_add_extent_hole_real All the parameters passed to xfs_bmap_add_extent_hole_real() are in the xfs_bmalloca structure now. Just pass the bmalloca parameter to the function instead of 8 separate parameters. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 161 ++++++++++++++++++++++++++---------------------------- 1 file changed, 78 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c8b9c4ec9f6f..2b31945ce9fe 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1596,34 +1596,25 @@ xfs_bmap_add_extent_hole_delay( */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - struct xfs_trans *tp, - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + struct xfs_bmalloca *bma, + int whichfork) { + struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ - xfs_btree_cur_t *cur; /* if null, not a btree */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - *logflagsp = 0; - - ifp = XFS_IFORK_PTR(ip, whichfork); - cur = *curp; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); - ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(xs_add_exlist); @@ -1634,9 +1625,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1645,9 +1636,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. 
*/ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1684,39 +1675,42 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - if (cur == NULL) { + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + right.br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1727,27 +1721,28 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, - left.br_startoff, - left.br_startblock, - left.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1758,28 +1753,30 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. 
*/ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, right.br_startblock, - right.br_blockcount, &i))) + right.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, - right.br_state))) + right.br_state); + if (error) goto done; } break; @@ -1790,21 +1787,23 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - if (cur == NULL) { + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, - new->br_blockcount, &i))) + new->br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1812,26 +1811,26 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { int tmp_logflags; /* partial log flag return val */ - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, 0, &tmp_logflags, whichfork); - *logflagsp |= tmp_logflags; + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; if (error) goto done; } /* clear out the allocated field, done with it now in any case. 
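One idiom worth calling out in both xfs_bmap_add_extent_hole_real() above and the delay_real variant earlier: the neighbor checks OR bits into a single state word, and one switch on the combined mask then handles each merge case exactly once. A minimal sketch of that bitmask dispatch, with illustrative bit names rather than the real BMAP_* flags:

    #include <stdio.h>

    /* Illustrative bits; the real code has more (VALID, DELAY, FILLING). */
    #define LEFT_CONTIG  (1 << 0)
    #define RIGHT_CONTIG (1 << 1)

    static const char *classify(int left_touches, int right_touches)
    {
        int state = 0;

        if (left_touches)
            state |= LEFT_CONTIG;
        if (right_touches)
            state |= RIGHT_CONTIG;

        /* One switch on the combined mask handles each case exactly once. */
        switch (state) {
        case LEFT_CONTIG | RIGHT_CONTIG:
            return "merge all three extents into one";
        case LEFT_CONTIG:
            return "extend the left neighbor";
        case RIGHT_CONTIG:
            return "extend the right neighbor";
        case 0:
            return "insert a new extent record";
        }
        return "unreachable";
    }

    int main(void)
    {
        printf("%s\n", classify(1, 1));
        printf("%s\n", classify(0, 1));
        return 0;
    }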
*/ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } - xfs_bmap_check_leaf_extents(cur, ip, whichfork); + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - *logflagsp |= rval; + bma->logflags |= rval; return error; } @@ -4709,14 +4708,10 @@ xfs_bmapi_allocate( xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; - if (bma->wasdel) { + if (bma->wasdel) error = xfs_bmap_add_extent_delay_real(bma); - } else { - error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, - &bma->idx, &bma->cur, &bma->got, - bma->firstblock, bma->flist, &tmp_logflags, - whichfork); - } + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); bma->logflags |= tmp_logflags; if (error) -- cgit v1.2.3-58-ga151 From b0eab14e74d2d7b22d065e18a1cdebcf7716debf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:06 +0000 Subject: xfs: dont ignore error code from xfs_bmbt_update Fix a case in xfs_bmap_add_extent_unwritten_real where we aren't passing the returned error on. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 2b31945ce9fe..bb31d37bf49b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1217,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real( goto done; if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - if (xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state)) + LEFT.br_state); + if (error) goto done; } break; -- cgit v1.2.3-58-ga151 From d952e2f81244d6502aff126df5011fab10f92187 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:41:07 +0000 Subject: xfs: cleanup xfs_bmap.h Convert all function prototypes to the short form used elsewhere, and remove duplicates of comments already placed at the function body. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.h | 225 ++++++++---------------------------------------------- 1 file changed, 33 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 243e212b31f6..89ee672d378a 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -155,121 +155,29 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -/* - * Add bmap trace insert entries for all the contents of the extent list. - * - * Quite excessive tracing. Only do this for debug builds. - */ #if defined(__KERNEL) && defined(DEBUG) -void -xfs_bmap_trace_exlist( - struct xfs_inode *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in list */ - int whichfork, - unsigned long caller_ip); /* data or attr fork */ +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) #else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. 
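The xfs_bmbt_update fix above is a classic dropped-error bug: the return value was tested inside the if condition but never assigned, so the function later returned whatever stale value 'error' still held. A hypothetical stand-alone version that makes the difference visible; update_record() is a made-up stand-in, not the real XFS API:

    #include <stdio.h>

    /* Stand-in for a btree update that can fail. */
    static int update_record(void)
    {
        return -5;  /* always fail, to make the difference visible */
    }

    static int buggy(void)
    {
        int error = 0;

        if (update_record())    /* the failure is detected here... */
            goto done;          /* ...but its code is never captured */
    done:
        return error;           /* returns 0: the failure is lost */
    }

    static int fixed(void)
    {
        int error = update_record();

        if (error)
            goto done;
    done:
        return error;           /* returns -5, as it should */
    }

    int main(void)
    {
        printf("buggy()=%d fixed()=%d\n", buggy(), fixed());
        return 0;
    }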
- */ -int /* error code */ -xfs_bmap_add_attrfork( - struct xfs_inode *ip, /* incore inode pointer */ - int size, /* space needed for new attribute */ - int rsvd); /* flag for reserved block allocation */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - struct xfs_mount *mp); /* mount point structure */ - -/* - * Routine to clean up the free list data structure when - * an error occurs during a transaction. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist); /* free list to clean up */ - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - struct xfs_mount *mp, /* file system mount structure */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first unused block in the file. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - */ -int /* error */ -xfs_bmap_first_unused( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *unused, /* unused block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_before( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_offset( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *unused, /* last block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Read in the extents to iu_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. 
- */ -int /* error */ -xfs_bmap_read_extents( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); @@ -281,95 +189,28 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, struct xfs_bmap_free *flist); - -/* - * Unmap (remove) blocks from a file. - * If nexts is nonzero then the number of extents to remove is limited to - * that value. If not all extents in the block range can be removed then - * *done is set. - */ -int /* error */ -xfs_bunmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ - int flags, /* XFS_BMAPI_... */ - xfs_extnum_t nexts, /* number of extents max */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *done); /* set if not done yet */ - -/* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -uint -xfs_default_attroffset( - struct xfs_inode *ip); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); #ifdef __KERNEL__ - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed); /* xact committed or not */ - /* bmap to userspace formatter - copy to user & advance pointer */ typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); -/* - * Get inode's extents as described in bmv, and format for output. 
- */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg); /* formatter arg */ - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof); - -/* - * Count fsblocks of the given fork. - */ -int -xfs_bmap_count_blocks( - xfs_trans_t *tp, - struct xfs_inode *ip, - int whichfork, - int *count); - -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ -- cgit v1.2.3-58-ga151 From c029a50d51b8a9520105ec903639de03389915d0 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 21 Sep 2011 09:42:30 +0000 Subject: xfs: fix possible overflow in xfs_ioc_trim() In xfs_ioc_trim it is possible that computing the last allocation group to discard might overflow for big start & len values, because the result might be bigger than xfs_agnumber_t, which is 32 bits long. Fix this by not allowing the start and end block of the range to be beyond the end of the file system. Note that if the start is beyond the end of the file system we have to return -EINVAL, but in the "end" case we have to truncate it to the fs size. Also introduce an "end" variable, rather than using start+len, which might be more confusing to get right, as this bug shows. Signed-off-by: Lukas Czerner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_discard.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797dae32..8a24f0c6c860 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -38,7 +38,7 @@ xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, - xfs_fsblock_t len, + xfs_fsblock_t end, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { @@ -100,7 +100,7 @@ xfs_trim_extents( * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + XFS_AGB_TO_FSB(mp, agno, fbno) > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -145,7 +145,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, len, minlen; + xfs_fsblock_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -165,19 +165,19 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface.
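To see why the trim patch clamps the block range before any allocation-group arithmetic: dividing a huge 64-bit block number by the AG size can yield a quotient that no longer fits in the 32-bit xfs_agnumber_t, and the silent truncation produces nonsense loop bounds. A self-contained demonstration with invented geometry; fsb_to_agno() below is a stand-in, not the real XFS_FSB_TO_AGNO:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t agnumber_t;

    #define BLOCKS_PER_AG (1ULL << 20)
    #define FS_BLOCKS     (1ULL << 30)  /* 1024 AGs in total */

    static agnumber_t fsb_to_agno(uint64_t fsb)
    {
        return (agnumber_t)(fsb / BLOCKS_PER_AG);  /* silently truncates */
    }

    int main(void)
    {
        uint64_t start = 0;
        uint64_t len = 1ULL << 52;  /* huge range from userspace */
        uint64_t end = start + len - 1;

        /* Naive: 2^52 / 2^20 = 2^32 does not fit in 32 bits, so the
         * "last AG" wraps to 0 and the trim loop gets nonsense bounds. */
        agnumber_t naive_end_agno = fsb_to_agno(start + len);

        /* Fixed: clamp the block range to the device before any AG math. */
        if (start >= FS_BLOCKS)
            return 1;               /* the patch returns -EINVAL here */
        if (end > FS_BLOCKS - 1)
            end = FS_BLOCKS - 1;

        printf("naive end_agno=%u, clamped end_agno=%u\n",
               naive_end_agno, fsb_to_agno(end));
        return 0;
    }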
*/ start = XFS_B_TO_FSBT(mp, range.start); - len = XFS_B_TO_FSBT(mp, range.len); + end = start + XFS_B_TO_FSBT(mp, range.len) - 1; minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); - start_agno = XFS_FSB_TO_AGNO(mp, start); - if (start_agno >= mp->m_sb.sb_agcount) + if (start >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); + if (end > mp->m_sb.sb_dblocks - 1) + end = mp->m_sb.sb_dblocks - 1; - end_agno = XFS_FSB_TO_AGNO(mp, start + len); - if (end_agno >= mp->m_sb.sb_agcount) - end_agno = mp->m_sb.sb_agcount - 1; + start_agno = XFS_FSB_TO_AGNO(mp, start); + end_agno = XFS_FSB_TO_AGNO(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { - error = -xfs_trim_extents(mp, agno, start, len, minlen, + error = -xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); if (error) last_error = error; -- cgit v1.2.3-58-ga151 From 815cb21662b914e1e14c256a3d662b1352c8509e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Sep 2011 09:14:34 +0000 Subject: xfs: XFS_TRANS_SWAPEXT is not a valid flag for xfs_trans_commit XFS_TRANS_SWAPEXT is a transaction type, not a flag for xfs_trans_commit, so don't pass it in xfs_swap_extents. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_dfrag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 9a84a85c03b1..645387b69738 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -438,7 +438,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + error = xfs_trans_commit(tp, 0); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); -- cgit v1.2.3-58-ga151 From b10370585349d364ff3c550afa7922e6e21f029d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Sep 2011 14:55:51 +0000 Subject: xfs: unlock the inode before log force in xfs_fsync Only read the LSN we need to push to with the ilock held, and then release it before we do the log force to improve concurrency. This also removes the only direct caller of _xfs_trans_commit, thus allowing it to be merged into the plain xfs_trans_commit again. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 15 ++++++++------- fs/xfs/xfs_trans.c | 11 ++++------- fs/xfs/xfs_trans.h | 5 +---- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 558543c146b3..d8ef02eb178a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -137,6 +137,7 @@ xfs_file_fsync( struct xfs_trans *tp; int error = 0; int log_flushed = 0; + xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -214,9 +215,9 @@ xfs_file_fsync( */ xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); + error = xfs_trans_commit(tp, 0); + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* @@ -227,14 +228,14 @@ xfs_file_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. 
*/ - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); } + if (!error && lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index efc147f0e9b6..b64c8c79736a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1790,9 +1790,7 @@ xfs_trans_commit_cil( } /* - * xfs_trans_commit - * - * Commit the given transaction to the log a/synchronously. + * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical * transaction abort mechanism. Logically after the filesystem @@ -1804,10 +1802,9 @@ xfs_trans_commit_cil( * Do not reference the transaction structure after this call. */ int -_xfs_trans_commit( +xfs_trans_commit( struct xfs_trans *tp, - uint flags, - int *log_flushed) + uint flags) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; @@ -1866,7 +1863,7 @@ _xfs_trans_commit( if (sync) { if (!error) { error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); + XFS_LOG_SYNC, NULL); } XFS_STATS_INC(xs_trans_sync); } else { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759b6352..54b0b1cc3e3f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -487,10 +487,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int _xfs_trans_commit(xfs_trans_t *, - uint flags, - int *); -#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +int xfs_trans_commit(xfs_trans_t *, uint flags); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -- cgit v1.2.3-58-ga151 From 8292d88c5c833fc8b837c3a018fd6d72c35a3231 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:47:50 +0000 Subject: xfs: unlock the inode before log force in xfs_fs_nfs_commit_metadata Only read the LSN we need to push to with the ilock held, and then release it before we do the log force to improve concurrency. 
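The xfs_fsync patch above and the export-operations patch below apply the same pattern: hold the inode lock only long enough to read the LSN, drop it, and only then issue the potentially slow synchronous log force. A compilable model of that ordering, with stand-in types and a fake log_force() in place of _xfs_log_force_lsn():

    #include <stdio.h>

    typedef long long lsn_t;

    struct demo_inode {
        int   locked;
        int   pincount;   /* non-zero: changes not yet on disk */
        lsn_t last_lsn;   /* LSN the inode was last logged at */
    };

    static void ilock(struct demo_inode *ip)   { ip->locked = 1; }
    static void iunlock(struct demo_inode *ip) { ip->locked = 0; }

    static int log_force(lsn_t lsn)
    {
        /* a potentially slow, synchronous operation */
        printf("forcing log to lsn %lld\n", lsn);
        return 0;
    }

    static int commit_metadata(struct demo_inode *ip)
    {
        lsn_t lsn = 0;

        ilock(ip);
        if (ip->pincount)
            lsn = ip->last_lsn;   /* only *read* the LSN under the lock */
        iunlock(ip);

        /* The expensive log force runs with the inode unlocked, so
         * other threads can use the inode while we wait on the log. */
        if (!lsn)
            return 0;
        return log_force(lsn);
    }

    int main(void)
    {
        struct demo_inode ip = { 0, 1, 1234 };
        return commit_metadata(&ip);
    }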
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_export.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d322e48f..da108977b21f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + xfs_lsn_t lsn = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, NULL); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { -- cgit v1.2.3-58-ga151 From 23bb0be1a237c8732ce1a43140e5cb103a676b92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Sep 2011 20:47:51 +0000 Subject: xfs: unlock the inode before log force in xfs_change_file_space Let the transaction commit unlock the inode before it potentially causes a synchronous log force. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_vnodeops.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f47ecee8d437..c9c8e8230b21 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2349,8 +2349,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; @@ -2375,10 +2374,5 @@ xfs_change_file_space( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (attr_flags & XFS_ATTR_SYNC) xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } -- cgit v1.2.3-58-ga151 From ddc3415aba1cb2f86d1fcad720cea834ee178f54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Sep 2011 15:00:54 +0000 Subject: xfs: simplify xfs_trans_ijoin* again There is no reason to keep a reference to the inode even if we unlock it during transaction commit because we never drop a reference between the ijoin and commit. Also use this fact to merge xfs_trans_ijoin_ref back into xfs_trans_ijoin - the third argument decides if an unlock is needed now. I'm actually starting to wonder if allowing inodes to be unlocked at transaction commit really is worth the effort. The only real benefit is that they can be unlocked earlier when committing a synchronous transaction, but that could be solved by doing the log force manually after the unlock, too.
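A reduced model of the merged interface described above, using stand-in names and flag values: the lock_flags argument recorded at join time tells commit whether to unlock the inode, and no extra reference is needed because none is dropped between join and commit. The mechanical call-site conversions follow below.

    #include <stdio.h>

    #define ILOCK_EXCL 0x1

    struct demo_inode { unsigned int lock_flags; };

    /* After the merge there is a single xfs_trans_ijoin()-style call:
     * lock_flags == 0 means the caller keeps the inode locked across
     * commit; non-zero means commit unlocks it. */
    static void trans_ijoin(struct demo_inode *ip, unsigned int lock_flags)
    {
        ip->lock_flags = lock_flags;
    }

    static void trans_commit(struct demo_inode *ip)
    {
        if (ip->lock_flags)
            printf("commit unlocks inode (flags 0x%x)\n", ip->lock_flags);
        else
            printf("caller still holds the inode lock\n");
    }

    int main(void)
    {
        struct demo_inode a = { 0 }, b = { 0 };

        trans_ijoin(&a, 0);           /* old xfs_trans_ijoin() behavior */
        trans_ijoin(&b, ILOCK_EXCL);  /* old xfs_trans_ijoin_ref() behavior */
        trans_commit(&a);
        trans_commit(&b);
        return 0;
    }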
Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 28 ++++++++++++++-------------- fs/xfs/xfs_bmap.c | 4 ++-- fs/xfs/xfs_dfrag.c | 4 ++-- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c | 6 +++--- fs/xfs/xfs_inode_item.c | 4 +--- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iomap.c | 6 +++--- fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_rename.c | 8 ++++---- fs/xfs/xfs_rtalloc.c | 10 +++++----- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans.h | 3 +-- fs/xfs/xfs_trans_inode.c | 25 +++++-------------------- fs/xfs/xfs_vnodeops.c | 34 +++++++++++++++++----------------- 18 files changed, 65 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 5484766938f9..981a0624f382 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -319,7 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * If the attribute list is non-existent or a shortform list, @@ -389,7 +389,7 @@ xfs_attr_set_int( * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Commit the leaf transformation. We'll need another (linked) @@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -961,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the current trans (including the inode) and start @@ -1063,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); @@ -1137,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); return(0); @@ -1291,7 +1291,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the node conversion and start the next @@ -1328,7 +1328,7 @@ restart: * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1440,7 +1440,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1572,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. 
*/ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the Btree join operation and start a new trans. @@ -1623,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_brelse(args->trans, bp); } @@ -2060,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2207,7 +2207,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, args->dp); + xfs_trans_ijoin(args->trans, args->dp, 0); /* * Close out trans and start the next one in the chain. diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index bb31d37bf49b..c68baeb0974a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2195,7 +2195,7 @@ xfs_bmap_rtalloc( * Lock out other modifications to the RT bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, @@ -3460,7 +3460,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 645387b69738..654dc6f05bac 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -425,8 +425,8 @@ xfs_swap_extents( } - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 179673531f20..c597bfe4ada0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -377,7 +377,7 @@ xfs_qm_dqalloc( return (ESRCH); } - xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; error = xfs_bmapi_write(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d8ef02eb178a..0b600b51778c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -213,7 +213,7 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp, 0); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f690d3a10b34..b676494a55ca 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1152,7 +1152,7 @@ xfs_ialloc( /* * Log the new values stuffed into the inode. 
*/ - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ @@ -1297,7 +1297,7 @@ xfs_itruncate_extents( */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (committed) - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out_bmap_cancel; @@ -1313,7 +1313,7 @@ xfs_itruncate_extents( error = xfs_trans_commit(tp, 0); tp = ntp; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 588406dc6a35..8704a99241d7 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -658,10 +658,8 @@ xfs_inode_item_unlock( lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; - if (lock_flags) { + if (lock_flags) xfs_iunlock(ip, lock_flags); - IRELE(ip); - } } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7ce7debe14c..d99a90518909 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1069,7 +1069,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index da5bf05c5bb7..9afa282aa937 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -208,7 +208,7 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; if (offset < ip->i_size || extsz) @@ -528,7 +528,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &first_block); @@ -692,7 +692,7 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Modify the unwritten extent state of the buffer. diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e6b3e7620888..556bbe7751b7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -611,7 +611,7 @@ xfs_setattr_nonsize( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -863,7 +863,7 @@ xfs_setattr_size( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f42e6c..5cc3dde1bc90 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index df78c297d1a1..866de277079a 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -170,12 +170,12 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. 
*/ - xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) - xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f29424964521..87323f1ded64 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -112,7 +112,7 @@ xfs_growfs_rt_alloc( * Lock the inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&flist, &firstblock); /* @@ -155,7 +155,7 @@ xfs_growfs_rt_alloc( * Lock the bitmap inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Get a buffer for the block. */ @@ -1960,7 +1960,7 @@ xfs_growfs_rt( * Lock out other callers by grabbing the bitmap inode lock. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * Update the bitmap inode's size. */ @@ -1972,7 +1972,7 @@ xfs_growfs_rt( * Get the summary inode into the transaction. */ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * Update the summary inode's size. */ @@ -2143,7 +2143,7 @@ xfs_rtfree_extent( * Synchronize by locking the bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); #if defined(__KERNEL__) && defined(DEBUG) /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54d5e102ffe1..6ad05e68abda 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -884,7 +884,7 @@ xfs_log_inode( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b64c8c79736a..1f35b2feca97 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -2018,6 +2018,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 54b0b1cc3e3f..f5df16969f82 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); -void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index c8dea2fd7e68..32f0288ae10f 100644 --- 
a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug( * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_inode *ip, + uint lock_flags) { xfs_inode_log_item_t *iip; @@ -59,7 +61,9 @@ xfs_trans_ijoin( if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; /* * Get a log_item_desc to point at the new item. @@ -69,25 +73,6 @@ xfs_trans_ijoin( xfs_trans_inode_broot_debug(ip); } -/* - * Add a locked inode to the transaction. - * - * - * Grabs a reference to the inode which will be dropped when the transaction - * is committed. The inode will also be unlocked at that point. The inode - * must be locked, and it cannot be associated with any transaction. - */ -void -xfs_trans_ijoin_ref( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint lock_flags) -{ - xfs_trans_ijoin(tp, ip); - IHOLD(ip); - ip->i_itemp->ili_lock_flags = lock_flags; -} - /* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c9c8e8230b21..fc38f15808c6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -219,7 +219,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { @@ -288,7 +288,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -336,7 +336,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -469,7 +469,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -663,7 +663,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { @@ -687,7 +687,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -700,7 +700,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } /* @@ -940,7 +940,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
*/ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1261,8 +1261,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. @@ -1407,8 +1407,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -1602,7 +1602,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -1735,7 +1735,7 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; @@ -1878,7 +1878,7 @@ xfs_alloc_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &firstfsb); error = xfs_bmapi_write(tp, ip, startoffset_fsb, @@ -2176,7 +2176,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * issue the bunmapi() call to free the blocks @@ -2349,7 +2349,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; -- cgit v1.2.3-58-ga151 From 3815832a2aa4df9815d15dac05227e0c8551833f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 30 Sep 2011 04:45:02 +0000 Subject: xfs: Don't allocate new buffers on every call to _xfs_buf_find Stats show that for an 8-way unlink @ ~80,000 unlinks/s we are doing ~1 million cache hit lookups to ~3000 buffer creates. That's almost 3 orders of magnitude more cache hits than misses, so optimising for cache hits is quite important. In the cache hit case, we do not need the new buffer that was allocated in case of a cache miss, so we are effectively hitting the allocator for no good reason for the vast majority of calls to _xfs_buf_find. 8-way create workloads are showing similar cache hit/miss ratios. The result is profiles that look like this: samples pcnt function DSO _______ _____ _______________________________ _________________ 1036.00 10.0% _xfs_buf_find [kernel.kallsyms] 582.00 5.6% kmem_cache_alloc [kernel.kallsyms] 519.00 5.0% __memcpy [kernel.kallsyms] 468.00 4.5% __ticket_spin_lock [kernel.kallsyms] 388.00 3.7% kmem_cache_free [kernel.kallsyms] 331.00 3.2% xfs_log_commit_cil [kernel.kallsyms] Further, there is a fair bit of work involved in initialising a new buffer once a cache miss has occurred and we currently do that under the rbtree spinlock. That increases spinlock hold time on what are heavily used trees. To fix this, remove the initialisation of the buffer from _xfs_buf_find() and only allocate the new buffer once we've had a cache miss (a sketch of this pattern follows below).
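The lookup-then-allocate pattern just described can be illustrated with a minimal, self-contained user-space sketch. This is not the XFS code: struct buf, cache_find_locked() and buf_get() are hypothetical stand-ins, a single mutex stands in for the rbtree spinlock, and reference counting and eviction are omitted for brevity.

	/*
	 * Hedged sketch of "look up first, allocate only on a miss, recheck on
	 * insert": the common cache-hit path never touches the allocator, and
	 * the allocation itself happens outside the lock.
	 */
	#include <pthread.h>
	#include <stdlib.h>

	struct buf {
		long		blkno;
		struct buf	*next;
	};

	static struct buf *cache_head;
	static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

	static struct buf *cache_find_locked(long blkno)
	{
		struct buf *bp;

		for (bp = cache_head; bp; bp = bp->next)
			if (bp->blkno == blkno)
				return bp;
		return NULL;
	}

	struct buf *buf_get(long blkno)
	{
		struct buf *bp, *new_bp;

		pthread_mutex_lock(&cache_lock);
		bp = cache_find_locked(blkno);		/* common case: cache hit */
		pthread_mutex_unlock(&cache_lock);
		if (bp)
			return bp;

		new_bp = calloc(1, sizeof(*new_bp));	/* allocate outside the lock */
		if (!new_bp)
			return NULL;
		new_bp->blkno = blkno;

		pthread_mutex_lock(&cache_lock);
		bp = cache_find_locked(blkno);		/* recheck: another thread may have raced us */
		if (!bp) {
			new_bp->next = cache_head;	/* confirmed miss: insert our buffer */
			cache_head = new_bp;
			bp = new_bp;
		} else {
			free(new_bp);			/* lost the race: discard the spare */
		}
		pthread_mutex_unlock(&cache_lock);
		return bp;
	}

The key design point is that the allocator is paid for only on the rare miss path, and the recheck under the lock keeps the insert correct when two threads miss on the same block concurrently.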
Initialise the buffer immediately after allocating it in xfs_buf_get, too, so that it is ready for insert if we get another cache miss after allocation. This minimises lock hold time and avoids unnecessary allocator churn. The resulting profiles look like: samples pcnt function DSO _______ _____ ___________________________ _________________ 8111.00 9.1% _xfs_buf_find [kernel.kallsyms] 4380.00 4.9% __memcpy [kernel.kallsyms] 4341.00 4.8% __ticket_spin_lock [kernel.kallsyms] 3401.00 3.8% kmem_cache_alloc [kernel.kallsyms] 2856.00 3.2% xfs_log_commit_cil [kernel.kallsyms] 2625.00 2.9% __kmalloc [kernel.kallsyms] 2380.00 2.7% kfree [kernel.kallsyms] 2016.00 2.3% kmem_cache_free [kernel.kallsyms] Showing a significant reduction in time spent doing allocation and freeing from slabs (kmem_cache_alloc and kmem_cache_free). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e3af85095ddb..6785b7bd952d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,8 +477,6 @@ _xfs_buf_find( /* No match found */ if (new_bp) { - _xfs_buf_initialize(new_bp, btp, range_base, - range_length, flags); rb_link_node(&new_bp->b_rbnode, parent, rbp); rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ @@ -521,35 +519,53 @@ found: } /* - * Assembles a buffer covering the specified range. - * Storage in memory for all portions of the buffer will be allocated, - * although backing storage may not be. + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. */ -xfs_buf_t * +struct xfs_buf * xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags) { - xfs_buf_t *bp, *new_bp; + struct xfs_buf *bp; + struct xfs_buf *new_bp; int error = 0; + bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + if (likely(bp)) + goto found; + new_bp = xfs_buf_allocate(flags); if (unlikely(!new_bp)) return NULL; + _xfs_buf_initialize(new_bp, target, + ioff << BBSHIFT, isize << BBSHIFT, flags); + bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (!bp) { + xfs_buf_deallocate(new_bp); + return NULL; + } + if (bp == new_bp) { error = xfs_buf_allocate_memory(bp, flags); if (error) goto no_buffer; - } else { + } else xfs_buf_deallocate(new_bp); - if (unlikely(bp == NULL)) - return NULL; - } + /* + * Now we have a workable buffer, fill in the block number so + * that we can do IO on it. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + +found: if (!(bp->b_flags & XBF_MAPPED)) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { @@ -560,18 +576,10 @@ xfs_buf_get( } XFS_STATS_INC(xb_get); - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later.
- */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; - trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - no_buffer: +no_buffer: if (flags & (XBF_LOCK | XBF_TRYLOCK)) xfs_buf_unlock(bp); xfs_buf_rele(bp); -- cgit v1.2.3-58-ga151 From 670ce93fef93bba8c8a422a79747385bec8e846a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 30 Sep 2011 04:45:03 +0000 Subject: xfs: reduce the number of log forces from tail pushing The AIL push code will issue a log force on every single push loop that it exits and has encountered pinned items. It doesn't rescan these pinned items until it revisits the AIL from the start. Hence we only need to force the log once per walk from the start of the AIL to the target LSN. This results in numbers like this: xs_push_ail_flush..... 1456 xs_log_force......... 1485 For an 8-way 50M inode create workload - almost all the log forces are coming from the AIL pushing code. Reduce the number of log forces by only forcing the log if the previous walk found pinned buffers. This reduces the numbers to: xs_push_ail_flush..... 665 xs_log_force......... 682 For the same test. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans_ail.c | 33 ++++++++++++++++++++------------- fs/xfs/xfs_trans_priv.h | 1 + 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index c15aa29fa169..9df7f9f1b5ee 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -372,12 +372,24 @@ xfs_ail_worker( xfs_lsn_t lsn; xfs_lsn_t target; long tout = 10; - int flush_log = 0; int stuck = 0; int count = 0; int push_xfsbufd = 0; + /* + * If last time we ran we encountered pinned items, force the log first + * and wait for it before pushing again. + */ spin_lock(&ailp->xa_lock); + if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && + !list_empty(&ailp->xa_ail)) { + ailp->xa_log_flush = 0; + spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + spin_lock(&ailp->xa_lock); + } + target = ailp->xa_target; lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { @@ -435,7 +447,7 @@ xfs_ail_worker( case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); stuck++; - flush_log = 1; + ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: @@ -480,16 +492,6 @@ xfs_ail_worker( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (flush_log) { - /* - * If something we need to push out was pinned, then - * push out the log so it will become unpinned and - * move forward in the AIL. - */ - XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, 0); - } - if (push_xfsbufd) { /* we've got delayed write buffers to flush */ wake_up_process(mp->m_ddev_targp->bt_task); @@ -500,6 +502,7 @@ out_done: if (!count) { /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; + ailp->xa_log_flush = 0; /* * We clear the XFS_AIL_PUSHING_BIT first before checking @@ -532,9 +535,13 @@ out_done: * were stuck. * * Backoff a bit more to allow some I/O to complete before - * continuing from where we were. + * restarting from the start of the AIL. This prevents us + * from spinning on the same items, and if they are pinned will + * allow the restart to issue a log force to unpin the stuck + * items. */ tout = 20; + ailp->xa_last_pushed_lsn = 0; } /* There is more to do, requeue us.
*/ diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 212946b97239..0a6eec6d472a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -71,6 +71,7 @@ struct xfs_ail { struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; unsigned long xa_flags; + int xa_log_flush; }; #define XFS_AIL_PUSHING_BIT 0 -- cgit v1.2.3-58-ga151 From 1da2f2dbf2d2aaa1b0f6ca2f61fcf07e24eb659b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 2 Oct 2011 14:25:16 +0000 Subject: xfs: optimize fsync on directories Directories are only updated transactionally, which means fsync only needs to flush the log if the inode is currently dirty, but not bother with checking for dirty data, non-transactional updates, and most importantly doesn't have to flush disk caches except as part of a transaction commit. While the first two optimizations can't easily be measured, the latter actually makes a difference when doing lots of fsyncs that do not actually have to commit the inode, e.g. because an earlier fsync already pushed the log far enough. The new xfs_dir_fsync is identical to xfs_nfs_commit_metadata except for the prototype, but I'm not sure creating a common helper for the two is worth it given how simple the functions are. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_file.c | 31 ++++++++++++++++++++++++++++++- fs/xfs/xfs_trace.h | 1 + 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0b600b51778c..753ed9b5c70b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,6 +124,35 @@ xfs_iozero( return (-status); } +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + STATIC int xfs_file_fsync( struct file *file, @@ -1140,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .fsync = xfs_file_fsync, + .fsync = xfs_dir_fsync, }; static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb5e660e0fab..b78f2c65438b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -576,6 +576,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); DEFINE_INODE_EVENT(xfs_file_compat_ioctl); DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); DEFINE_INODE_EVENT(xfs_write_inode); -- cgit v1.2.3-58-ga151 From 87c7bec7fc3377b3873eb3a0f4b603981ea16ebb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2011 14:08:26 +0000 Subject: xfs: fix buffer flushing during unmount The code to flush buffers in the umount code is a bit iffy: we first flush all delwri buffers out, but then might be able to queue up a new one when logging the sb counts.
On a normal shutdown that one would get flushed out when doing the synchronous superblock write in xfs_unmountfs_writesb, but we skip that one if the filesystem has been shut down. Fix this by moving the delwri list flushing to just before unmounting the log, and while we're at it also remove the superfluous delwri list and buffer lru flushing for the rt and log device that can never have cached or delwri buffers. Signed-off-by: Christoph Hellwig Reported-by: Amit Sahrawat Tested-by: Amit Sahrawat Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.h | 1 - fs/xfs/xfs_mount.c | 29 ++++++++++------------------- 2 files changed, 10 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c23826685d3d..200f59138b8a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -318,7 +318,6 @@ extern struct list_head *xfs_get_buftarg_list(void); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a957e437bee1..e8fe1cb54094 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,9 +44,6 @@ #include "xfs_trace.h" -STATIC void xfs_unmountfs_wait(xfs_mount_t *); - - #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); @@ -1496,11 +1493,6 @@ xfs_unmountfs( */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1526,7 +1518,16 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. + */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1537,16 +1538,6 @@ xfs_unmountfs( xfs_free_perag(mp); } -STATIC void -xfs_unmountfs_wait(xfs_mount_t *mp) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) - xfs_wait_buftarg(mp->m_rtdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); -} - int xfs_fs_writable(xfs_mount_t *mp) { -- cgit v1.2.3-58-ga151 From b17b833443a3b65907f5ecb36f8af33996f6ec78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:43 +0000 Subject: xfs: remove xfs_get_buftarg_list The code is unused and under a config option that doesn't exist, remove it.
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 8 -------- fs/xfs/xfs_buf.h | 4 ---- 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6785b7bd952d..e0339a4a0bc8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1835,11 +1835,3 @@ xfs_buf_terminate(void) destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } - -#ifdef CONFIG_KDB_MODULES -struct list_head * -xfs_get_buftarg_list(void) -{ - return &xfs_buftarg_list; -} -#endif diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 200f59138b8a..cb54cc234ed1 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -311,10 +311,6 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#ifdef CONFIG_KDB_MODULES -extern struct list_head *xfs_get_buftarg_list(void); -#endif - #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -- cgit v1.2.3-58-ga151 From 5fde0326ddb1472ef31034c8ed952a19d4679191 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:44 +0000 Subject: xfs: remove XFS_BUF_FINISH_IOWAIT Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_buf.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e0339a4a0bc8..36fed03da26f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1100,7 +1100,7 @@ xfs_bioerror_relse( * ASYNC buffers. */ xfs_buf_ioerror(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); + complete(&bp->b_iowait); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index cb54cc234ed1..65181220c9a2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -293,8 +293,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) return atomic_read(&bp->b_pin_count); } -#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); - static inline void xfs_buf_relse(xfs_buf_t *bp) { xfs_buf_unlock(bp); -- cgit v1.2.3-58-ga151 From 38f23232449c9d2c0bc8e9541cb8ab08b7c2b9ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:45 +0000 Subject: xfs: remove XFS_BUF_SET_VTYPE and XFS_BUF_SET_VTYPE_REF Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_btree.c | 8 ++++---- fs/xfs/xfs_buf.h | 7 +------ fs/xfs/xfs_da_btree.c | 11 ++++------- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 6 ------ 7 files changed, 13 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index bdd9cb54d63b..ce84ffd0264c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -452,7 +452,7 @@ xfs_alloc_read_agfl( if (error) return error; ASSERT(!xfs_buf_geterror(bp)); - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; } @@ -2139,7 +2139,7 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 28cc0199dc1f..1f19f03af9d3 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -631,7 +631,7 @@ 
xfs_btree_read_bufl( } ASSERT(!xfs_buf_geterror(bp)); if (bp) - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + xfs_buf_set_ref(bp, refval); *bpp = bp; return 0; } @@ -939,13 +939,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 65181220c9a2..ca2934717343 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -278,15 +278,10 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -static inline void -xfs_buf_set_ref( - struct xfs_buf *bp, - int lru_ref) +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { atomic_set(&bp->b_lru_ref, lru_ref); } -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) -#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) static inline int xfs_buf_ispinned(struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 46c8aa2740da..77c74257c2a3 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2053,13 +2053,10 @@ xfs_da_do_buf( if (!bp) continue; if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, - XFS_ATTR_BTREE_REF); - } else { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, - XFS_DIR_BTREE_REF); - } + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); } if (bplist) { bplist[nbplist++] = bp; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c597bfe4ada0..25d7280e9f6b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -605,7 +605,7 @@ xfs_qm_dqread( dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); /* Mark the buf so that this will stay incore a little longer */ - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 207e0b0c0730..169380e66057 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1505,7 +1505,7 @@ xfs_read_agi( return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + xfs_buf_set_ref(*bpp, XFS_AGI_REF); xfs_check_agi_unlinked(agi); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b676494a55ca..21cec6cdf453 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -190,12 +190,6 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - *bpp = bp; return 0; } -- cgit v1.2.3-58-ga151 From c867cb61641751fd3d86350232d64ae2a10137d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:46 +0000 Subject: xfs: remove XFS_BUF_STALE and XFS_BUF_SUPER_STALE Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_buf.c | 4 ++-- fs/xfs/xfs_buf.h | 6 ------ fs/xfs/xfs_buf_item.c | 6 ++++-- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_log.c | 4 ++-- 
fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_rw.c | 2 +- fs/xfs/xfs_trans_buf.c | 13 +++++++++---- 9 files changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 981a0624f382..ae8f917490d4 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2168,7 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) */ bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_delwri_dequeue(bp); xfs_buf_relse(bp); bp = NULL; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 36fed03da26f..f88eab9e8144 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1061,7 +1061,7 @@ xfs_bioerror( XFS_BUF_UNREAD(bp); xfs_buf_delwri_dequeue(bp); XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); @@ -1090,7 +1090,7 @@ xfs_bioerror_relse( XFS_BUF_UNREAD(bp); xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bp->b_iodone = NULL; if (!(fl & XBF_ASYNC)) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index ca2934717343..fa38401449d9 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -242,14 +242,8 @@ xfs_buf_target_name(struct xfs_buftarg *target) XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_SUPER_STALE(bp) do { \ - XFS_BUF_STALE(bp); \ - xfs_buf_delwri_dequeue(bp); \ - XFS_BUF_DONE(bp); \ - } while (0) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3243083d8693..8213f4a753dc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -966,7 +966,9 @@ xfs_buf_iodone_callbacks( * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + xfs_buf_delwri_dequeue(bp); + XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } @@ -1005,7 +1007,7 @@ xfs_buf_iodone_callbacks( * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). 
*/ - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); XFS_BUF_DONE(bp); xfs_buf_delwri_dequeue(bp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 21cec6cdf453..c0237c602f11 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2469,11 +2469,11 @@ cluster_corrupt_out: */ if (bp->b_iodone) { XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); } } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3a8d4f66d702..493447dc4f55 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -881,7 +881,7 @@ xlog_iodone(xfs_buf_t *bp) if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed @@ -1247,7 +1247,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); /* * It would seem logical to return EIO here, but we rely on diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index be173852b2ca..8f70f3469997 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2174,7 +2174,7 @@ xlog_recover_buffer_pass2( be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 99823c3b9aca..ff33645fe62d 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -150,7 +150,7 @@ xfs_read_buf( if (bp) { XFS_BUF_UNDONE(bp); xfs_buf_delwri_dequeue(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 5e5196a269dd..d03a8ee19172 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -160,8 +160,11 @@ xfs_trans_get_buf(xfs_trans_t *tp, bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) - XFS_BUF_SUPER_STALE(bp); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + xfs_buf_delwri_dequeue(bp); + XFS_BUF_DONE(bp); + } /* * If the buffer is stale then it was binval'ed @@ -387,7 +390,9 @@ xfs_trans_read_buf( } if (bp->b_error) { error = bp->b_error; - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + xfs_buf_delwri_dequeue(bp); + XFS_BUF_DONE(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); if (tp->t_flags & XFS_TRANS_DIRTY) @@ -740,7 +745,7 @@ xfs_trans_binval( * rid of it. */ xfs_buf_delwri_dequeue(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; -- cgit v1.2.3-58-ga151 From af5c4bee499eb68bc36ca046030394d82d0e3669 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:47 +0000 Subject: xfs: remove buffers from the delwri list in xfs_buf_stale For each call to xfs_buf_stale we call xfs_buf_delwri_dequeue either directly before or after it, or we are guaranteed by the surrounding conditionals that it is never called on delwri buffers.
Simplify this situation by moving the call to xfs_buf_delwri_dequeue into xfs_buf_stale. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_attr.c | 1 - fs/xfs/xfs_buf.c | 3 +-- fs/xfs/xfs_buf_item.c | 2 -- fs/xfs/xfs_rw.c | 1 - fs/xfs/xfs_trans_buf.c | 3 --- 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index ae8f917490d4..1e5d97f86ea8 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2169,7 +2169,6 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); xfs_buf_relse(bp); bp = NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f88eab9e8144..3df7d0a2b245 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -151,6 +151,7 @@ xfs_buf_stale( struct xfs_buf *bp) { bp->b_flags |= XBF_STALE; + xfs_buf_delwri_dequeue(bp); atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -1059,7 +1060,6 @@ xfs_bioerror( * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); @@ -1088,7 +1088,6 @@ xfs_bioerror_relse( * change that interface. */ XFS_BUF_UNREAD(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); xfs_buf_stale(bp); bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8213f4a753dc..06a76ca99659 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -967,7 +967,6 @@ xfs_buf_iodone_callbacks( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; @@ -1009,7 +1008,6 @@ xfs_buf_iodone_callbacks( */ xfs_buf_stale(bp); XFS_BUF_DONE(bp); - xfs_buf_delwri_dequeue(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index ff33645fe62d..86f1928b4cfa 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -149,7 +149,6 @@ xfs_read_buf( } if (bp) { XFS_BUF_UNDONE(bp); - xfs_buf_delwri_dequeue(bp); xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index d03a8ee19172..5bab5980a6f9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -162,7 +162,6 @@ xfs_trans_get_buf(xfs_trans_t *tp, ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); } @@ -391,7 +390,6 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; xfs_buf_stale(bp); - xfs_buf_delwri_dequeue(bp); XFS_BUF_DONE(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); if (tp->t_flags & XFS_TRANS_DIRTY) @@ -744,7 +742,6 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - xfs_buf_delwri_dequeue(bp); xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); -- cgit v1.2.3-58-ga151 From 4347b9d7ad4223474d315c3ab6bc1ce7cce7fa2d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:48 +0000 Subject: xfs: clean up buffer allocation Change _xfs_buf_initialize to allocate the buffer directly and rename it to xfs_buf_alloc now that it is the only buffer allocation routine. Also remove the xfs_buf_deallocate wrapper around the kmem_zone_free calls for buffers.
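The shape of this cleanup, collapsing a raw allocation plus a separate initialisation helper into one constructor and dropping the free wrapper, can be sketched in plain C. This is a hedged illustration with hypothetical names (obj_alloc and obj_free stand in for xfs_buf_alloc and the direct kmem_zone_free call), not the kernel code:

	#include <stdlib.h>

	struct obj {
		long	base;
		size_t	len;
		int	flags;
	};

	/* One constructor owns both allocation and initialisation ... */
	struct obj *obj_alloc(long base, size_t len, int flags)
	{
		struct obj *o = calloc(1, sizeof(*o));	/* stands in for kmem_zone_alloc() */

		if (!o)
			return NULL;
		o->base = base;
		o->len = len;
		o->flags = flags;
		return o;
	}

	/* ... and teardown calls the allocator directly, with no wrapper. */
	void obj_free(struct obj *o)
	{
		free(o);				/* stands in for kmem_zone_free() */
	}

With a single constructor, callers can never observe a half-initialised object, which is what the reworked xfs_buf_get fast path relies on.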
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 50 ++++++++++++++++++-------------------------------- fs/xfs/xfs_buf.h | 3 ++- fs/xfs/xfs_log.c | 2 +- 3 files changed, 21 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 3df7d0a2b245..1f24ee5f0d7a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -65,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xb_to_km(flags) \ (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) -#define xfs_buf_allocate(flags) \ - kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) -#define xfs_buf_deallocate(bp) \ - kmem_zone_free(xfs_buf_zone, (bp)); static inline int xfs_buf_is_vmapped( @@ -167,14 +163,19 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } -STATIC void -_xfs_buf_initialize( - xfs_buf_t *bp, - xfs_buftarg_t *target, +struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, xfs_off_t range_base, size_t range_length, xfs_buf_flags_t flags) { + struct xfs_buf *bp; + + bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + if (unlikely(!bp)) + return NULL; + /* * We don't want certain flags to appear in b_flags. */ @@ -203,8 +204,9 @@ _xfs_buf_initialize( init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(xb_create); - trace_xfs_buf_init(bp, _RET_IP_); + + return bp; } /* @@ -277,7 +279,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); } /* @@ -539,16 +541,14 @@ xfs_buf_get( if (likely(bp)) goto found; - new_bp = xfs_buf_allocate(flags); + new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, + flags); if (unlikely(!new_bp)) return NULL; - _xfs_buf_initialize(new_bp, target, - ioff << BBSHIFT, isize << BBSHIFT, flags); - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); if (!bp) { - xfs_buf_deallocate(new_bp); + kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } @@ -557,7 +557,7 @@ xfs_buf_get( if (error) goto no_buffer; } else - xfs_buf_deallocate(new_bp); + kmem_zone_free(xfs_buf_zone, new_bp); /* * Now we have a workable buffer, fill in the block number so @@ -694,19 +694,6 @@ xfs_buf_read_uncached( return bp; } -xfs_buf_t * -xfs_buf_get_empty( - size_t len, - xfs_buftarg_t *target) -{ - xfs_buf_t *bp; - - bp = xfs_buf_allocate(0); - if (bp) - _xfs_buf_initialize(bp, target, 0, len, 0); - return bp; -} - /* * Return a buffer allocated as an empty buffer and associated to external * memory via xfs_buf_associate_memory() back to it's empty state. 
@@ -792,10 +779,9 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_allocate(0); + bp = xfs_buf_alloc(target, 0, len, 0); if (unlikely(bp == NULL)) goto fail; - _xfs_buf_initialize(bp, target, 0, len, 0); error = _xfs_buf_get_pages(bp, page_count, 0); if (error) @@ -823,7 +809,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index fa38401449d9..26b909417deb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, + xfs_buf_flags_t); extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 493447dc4f55..8c9db4e5ddd2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1047,7 +1047,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; -- cgit v1.2.3-58-ga151 From 901796afca0d31d97bf6d1bf2ab251a93a4b8c83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:49 +0000 Subject: xfs: clean up xfs_ioerror_alert Instead of passing the block number and mount structure explicitly, get them off the bp and make the argument order more natural. Also move it to xfs_buf.c and stop printing the device name, given that we already get the fs name as part of xfs_alert, and we know what device it operates on because of the caller that gets printed. Finally, rename it to xfs_buf_ioerror_alert and pass __func__ as argument where it makes sense.
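To make the calling-convention change concrete, here is a hedged user-space sketch of the idea: derive everything the error report needs from the buffer object itself rather than passing it alongside. struct buf and buf_ioerror_alert() are illustrative stand-ins, not the kernel API:

	#include <stdio.h>

	struct buf {
		unsigned long long	blkno;	/* block address kept on the object */
		int			error;	/* last I/O error recorded on the object */
		size_t			count;	/* I/O size */
	};

	/* Callers pass only the buffer and their own name. */
	void buf_ioerror_alert(const struct buf *bp, const char *func)
	{
		fprintf(stderr,
			"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zu\n",
			bp->blkno, func, bp->error, bp->count);
	}

A typical call site then shrinks to a two-liner of the form "if (error) buf_ioerror_alert(bp, __func__);", mirroring the pattern visible throughout the diff below, and there is no longer a way to report a block number that disagrees with the buffer being reported.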
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 11 +++++++++++ fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_log.c | 14 +++++++------- fs/xfs/xfs_log_recover.c | 25 ++++++++----------------- fs/xfs/xfs_mount.c | 3 +-- fs/xfs/xfs_rw.c | 20 +------------------- fs/xfs/xfs_rw.h | 2 -- fs/xfs/xfs_trans_buf.c | 9 +++------ fs/xfs/xfs_vnodeops.c | 11 +++++------ 9 files changed, 37 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1f24ee5f0d7a..0a767fca0305 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1004,6 +1004,17 @@ xfs_buf_ioerror( trace_xfs_buf_ioerror(bp, error, _RET_IP_); } +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", + (__uint64_t)XFS_BUF_ADDR(bp), func, + bp->b_error, XFS_BUF_COUNT(bp)); +} + int xfs_bwrite( struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 26b909417deb..357a3371cae7 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -205,6 +205,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8c9db4e5ddd2..2758a6277c52 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -880,7 +880,7 @@ xlog_iodone(xfs_buf_t *bp) */ if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { - xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* @@ -1387,9 +1387,9 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync", log->l_mp, bp, - XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); return error; } if (split) { @@ -1423,9 +1423,9 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync (split)", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); return error; } } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8f70f3469997..82ee9db628ed 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -183,8 +183,7 @@ xlog_bread_noalign( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -269,10 +268,8 @@ xlog_bwrite( XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); error = xfs_bwrite(bp); - if (error) { - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); - } + if (error) + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -364,9 +361,7 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! 
*/ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_target->bt_mount, bp, - XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } @@ -2138,8 +2133,7 @@ xlog_recover_buffer_pass2( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, - bp, buf_f->blf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); xfs_buf_relse(bp); return error; } @@ -2234,8 +2228,7 @@ xlog_recover_inode_pass2( } error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); xfs_buf_relse(bp); goto error; } @@ -2542,8 +2535,7 @@ xlog_recover_dquot_pass2( XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); return error; } ASSERT(bp); @@ -3695,8 +3687,7 @@ xlog_do_recover( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e8fe1cb54094..f3b1cec38b81 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1610,8 +1610,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) - xfs_ioerror_alert("xfs_unmountfs_writesb", - mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_ioerror_alert(sbp, __func__); xfs_buf_relse(sbp); } return error; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 86f1928b4cfa..597d044a09a1 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -91,24 +91,6 @@ xfs_do_force_shutdown( } } -/* - * Prints out an ALERT message about I/O error. 
- */ -void -xfs_ioerror_alert( - char *func, - struct xfs_mount *mp, - xfs_buf_t *bp, - xfs_daddr_t blkno) -{ - xfs_alert(mp, - "I/O error occurred: meta-data dev %s block 0x%llx" - " (\"%s\") error %d buf count %zd", - xfs_buf_target_name(bp->b_target), - (__uint64_t)blkno, func, - bp->b_error, XFS_BUF_COUNT(bp)); -} - /* * This isn't an absolute requirement, but it is * just a good idea to call xfs_read_buf instead of @@ -143,7 +125,7 @@ xfs_read_buf( } else { *bpp = NULL; if (error) { - xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } else { error = XFS_ERROR(EIO); } diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 11c41ec6ed75..bbdb9ad6a4ba 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, - xfs_buf_t *bp, xfs_daddr_t blkno); extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 5bab5980a6f9..475a4ded4f41 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -296,8 +296,7 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -339,8 +338,7 @@ xfs_trans_read_buf( xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read @@ -391,8 +389,7 @@ xfs_trans_read_buf( error = bp->b_error; xfs_buf_stale(bp); XFS_BUF_DONE(bp); - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index fc38f15808c6..4ecf2a549060 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -87,8 +87,7 @@ xfs_readlink_bmap( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); goto out; } @@ -1993,8 +1992,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); break; } memset(bp->b_addr + @@ -2006,8 +2005,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); break; } } -- cgit v1.2.3-58-ga151 From b38505b09b7854d446b2f60b4414e3231277aa1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:50 +0000 Subject: xfs: use xfs_ioerror_alert in xfs_buf_iodone_callbacks Use xfs_ioerror_alert instead of opencoding a very similar error message. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 06a76ca99659..65d6f4432d28 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -975,9 +975,7 @@ xfs_buf_iodone_callbacks( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - xfs_buf_target_name(bp->b_target), - (__uint64_t)XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } lasttarg = bp->b_target; -- cgit v1.2.3-58-ga151 From 02b102df1502a7ea4167d115510e1e8fe6467f12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:51 +0000 Subject: xfs: remove xfs_buf_target_name The calling convention that returns a pointer to a static buffer is fairly nasty, so just opencode it in the only caller that is left. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 6 +++++- fs/xfs/xfs_buf.h | 9 --------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0a767fca0305..6f615c259411 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1457,9 +1457,13 @@ xfs_setsize_buftarg_flags( btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, xfs_buf_target_name(btp)); + sectorsize, name); return EINVAL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 357a3371cae7..be19dd2b0212 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -230,15 +230,6 @@ extern void xfs_buf_delwri_promote(struct xfs_buf *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -static inline const char * -xfs_buf_target_name(struct xfs_buftarg *target) -{ - static char __b[BDEVNAME_SIZE]; - - return bdevname(target->bt_bdev, __b); -} - - #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) -- cgit v1.2.3-58-ga151 From a9add83e5abd29bf2b7b3658311199eeabbdefc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:52 +0000 Subject: xfs: remove XFS_bflush Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.h | 2 -- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_super.c | 4 ++-- fs/xfs/xfs_sync.c | 2 +- 6 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index be19dd2b0212..5bab046e859f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -294,6 +294,4 @@ extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) - #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82ee9db628ed..541a508adea1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3654,7 +3654,7 @@ xlog_do_recover( return error; } - XFS_bflush(log->l_mp->m_ddev_targp); + xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened 
during recovery, bail out. diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f3b1cec38b81..d06afbc3540d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1481,7 +1481,7 @@ xfs_unmountfs( * state as much as possible. */ xfs_reclaim_inodes(mp, 0); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ddaf97a57ec6..5cff443f6cdb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1681,7 +1681,7 @@ xfs_qm_quotacheck( * quotacheck'd stamp on the superblock. So, here we do a synchronous * flush. */ - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); /* * If one type of quotas is off, then it will lose its diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 6ad05e68abda..ba16248bcf24 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1015,7 +1015,7 @@ xfs_fs_put_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); xfs_freesb(mp); @@ -1439,7 +1439,7 @@ xfs_fs_fill_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); goto out_free_sb; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index bf2b38c21caa..aa3dc1a4d53d 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -377,7 +377,7 @@ xfs_quiesce_data( /* flush data-only devices */ if (mp->m_rtdev_targp) - XFS_bflush(mp->m_rtdev_targp); + xfs_flush_buftarg(mp->m_rtdev_targp, 1); return error ? error : error2; } -- cgit v1.2.3-58-ga151 From 5a93a064d27b42e4af1772b0599b53e3241191ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Oct 2011 16:52:53 +0000 Subject: xfs: do not flush data workqueues in xfs_flush_buftarg When we call xfs_flush_buftarg (generally from sync or umount) it already is too late to flush the data workqueues, as I/O completion is signalled for them and we are thus already done with the data we would flush here. There are places where flushing them might be useful, but the current sync interface doesn't give us that opportunity. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_buf.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6f615c259411..cf0ac056815f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1623,13 +1623,6 @@ xfs_buf_delwri_promote( spin_unlock(&btp->bt_delwri_lock); } -STATIC void -xfs_buf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); -} - /* * Move as many buffers as specified to the supplied list * idicating if we skipped any buffers to prevent deadlocks. @@ -1752,9 +1745,7 @@ xfs_flush_buftarg( LIST_HEAD(wait_list); struct blk_plug plug; - xfs_buf_runall_queues(xfsconvertd_workqueue); - xfs_buf_runall_queues(xfsdatad_workqueue); - xfs_buf_runall_queues(xfslogd_workqueue); + flush_workqueue(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); pincount = xfs_buf_delwri_split(target, &tmp_list, 0); -- cgit v1.2.3-58-ga151 From 5703728ac1194baee554163180415baf4ccd9996 Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 12 Oct 2011 15:09:34 +0800 Subject: nfs: fix bug about IPv6 address scope checking The result from ipv6_addr_scope() is a set of flags, not a single value, so we can't just compare the result with IPV6_ADDR_SCOPE_LINKLOCAL. 
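To see why the original test misfires, consider this small, hedged stand-alone example before the fix itself is described; the flag values are invented for illustration and are not the kernel's IPV6_ADDR_* definitions:

	#include <stdio.h>

	#define ADDR_UNICAST	0x0001	/* hypothetical classification bit */
	#define ADDR_LINKLOCAL	0x0020	/* hypothetical classification bit */

	int main(void)
	{
		unsigned int type = ADDR_UNICAST | ADDR_LINKLOCAL;

		/* Buggy: exact-match comparison against one bit of a bitmask. */
		if (type == ADDR_LINKLOCAL)
			printf("== matched\n");		/* never reached */

		/* Fixed: test the individual bit, as the patch below does. */
		if (type & ADDR_LINKLOCAL)
			printf("& matched\n");
		return 0;
	}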
This patch fixes the problem, and checks that the addresses are equal before comparing their scope IDs. Signed-off-by: Mi Jinlong Signed-off-by: J. Bruce Fields --- fs/nfs/client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5833fbbf59b0..b4e41dd4d0f6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3-58-ga151 From b238b8fa93353ab50c9a2b1e2fa47a0ab01c37cd Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Wed, 12 Oct 2011 09:17:24 -0700 Subject: pstore: make pstore write function return normal success/fail value Currently the pstore write interface uses the record id as its return value, but that is not enough because it cannot tell the caller whether the write operation succeeded. Pass the record id back via an argument pointer and return zero for success, non-zero for failure. Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 10 ++++++---- drivers/firmware/efivars.c | 17 +++++++++-------- fs/pstore/platform.c | 11 ++++++----- include/linux/pstore.h | 4 ++-- 4 files changed, 23 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 5e820ea35570..127408069ca7 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -933,7 +933,7 @@ static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); -static u64 erst_writer(enum pstore_type_id type, unsigned int part, +static int erst_writer(enum pstore_type_id type, u64 *id, unsigned int part, size_t size, struct pstore_info *psi); static int erst_clearer(enum pstore_type_id type, u64 id, struct pstore_info *psi); @@ -1040,11 +1040,12 @@ out: return (rc < 0) ?
rc : (len - sizeof(*rcd)); } -static u64 erst_writer(enum pstore_type_id type, unsigned int part, +static int erst_writer(enum pstore_type_id type, u64 *id, unsigned int part, size_t size, struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) (erst_info.buf - sizeof(*rcd)); + int ret; memset(rcd, 0, sizeof(*rcd)); memcpy(rcd->hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); @@ -1079,9 +1080,10 @@ static u64 erst_writer(enum pstore_type_id type, unsigned int part, } rcd->sec_hdr.section_severity = CPER_SEV_FATAL; - erst_write(&rcd->hdr); + ret = erst_write(&rcd->hdr); + *id = rcd->hdr.record_id; - return rcd->hdr.record_id; + return ret; } static int erst_clearer(enum pstore_type_id type, u64 id, diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index be8bcb035e2a..8370f72d87ff 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -490,8 +490,8 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, return 0; } -static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, - size_t size, struct pstore_info *psi) +static int efi_pstore_write(enum pstore_type_id type, u64 *id, + unsigned int part, size_t size, struct pstore_info *psi) { char name[DUMP_NAME_LEN]; char stub_name[DUMP_NAME_LEN]; @@ -499,7 +499,7 @@ static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, efi_guid_t vendor = LINUX_EFI_CRASH_GUID; struct efivars *efivars = psi->data; struct efivar_entry *entry, *found = NULL; - int i; + int i, ret = 0; sprintf(stub_name, "dump-type%u-%u-", type, part); sprintf(name, "%s%lu", stub_name, get_seconds()); @@ -548,18 +548,19 @@ static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, efivar_unregister(found); if (size) - efivar_create_sysfs_entry(efivars, + ret = efivar_create_sysfs_entry(efivars, utf16_strsize(efi_name, DUMP_NAME_LEN * 2), efi_name, &vendor); - return part; + *id = part; + return ret; }; static int efi_pstore_erase(enum pstore_type_id type, u64 id, struct pstore_info *psi) { - efi_pstore_write(type, id, 0, psi); + efi_pstore_write(type, &id, (unsigned int)id, 0, psi); return 0; } @@ -580,8 +581,8 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, return -1; } -static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, - size_t size, struct pstore_info *psi) +static int efi_pstore_write(enum pstore_type_id type, u64 *id, + unsigned int part, size_t size, struct pstore_info *psi) { return 0; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0472924024cc..2bd620f0d796 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -87,7 +87,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize; + int hsize, ret; unsigned int part = 1; unsigned long flags = 0; int is_locked = 0; @@ -122,9 +122,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, part, + ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part, hsize + l1_cpy + l2_cpy, psinfo); - if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; @@ -247,6 +247,7 @@ static void pstore_timefunc(unsigned long dummy) int pstore_write(enum pstore_type_id type, char *buf, size_t size) { u64 id; + int ret; unsigned long flags; if (!psinfo) @@ -257,8 
+258,8 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) spin_lock_irqsave(&psinfo->buf_lock, flags); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, 0, size, psinfo); - if (pstore_is_mounted()) + ret = psinfo->write(type, &id, 0, size, psinfo); + if (ret == 0 && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, size, CURRENT_TIME, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index b91440e64d6e..ea567321ae3c 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,8 +39,8 @@ struct pstore_info { int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); - u64 (*write)(enum pstore_type_id type, unsigned int part, - size_t size, struct pstore_info *psi); + int (*write)(enum pstore_type_id type, u64 *id, + unsigned int part, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, struct pstore_info *psi); void *data; -- cgit v1.2.3-58-ga151 From ac423446d83e4fd5c3ffba393b887e531cf6bf46 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: switch CIFSSMBQAllEAs to use memcmp ...as that's more efficient when we know that the lengths are equal. Reported-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a80f7bd97b90..b7df4bff3aaa 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5840,7 +5840,7 @@ QAllEAsRetry: if (ea_name) { if (ea_name_len == name_len && - strncmp(ea_name, temp_ptr, name_len) == 0) { + memcmp(ea_name, temp_ptr, name_len) == 0) { temp_ptr += name_len + 1; rc = value_len; if (buf_size == 0) -- cgit v1.2.3-58-ga151 From b4dacbc2823799b84d4d4563f865b0680cb44ed1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: use memcpy for magic string in cifs signature generation BSRSPYL ...it's more efficient since we know the length. 
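A minimal userspace sketch of why the substitution is safe here: "BSRSPYL" is seven characters plus its terminating NUL, exactly the eight bytes copied, so both calls produce identical output and memcpy merely skips the byte-by-byte terminator scan.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char a[8], b[8];

            strncpy(a, "BSRSPYL", 8);  /* scans for the NUL terminator as it copies */
            memcpy(b, "BSRSPYL", 8);   /* copies a known 8 bytes, no scanning */

            printf("identical: %d\n", memcmp(a, b, 8) == 0);  /* prints 1 */
            return 0;
    }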
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 30acd22147e1..12f1c1263013 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -92,7 +92,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, return rc; if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; } @@ -189,7 +189,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; } -- cgit v1.2.3-58-ga151 From 4a29a0bd1d63c1a434f29ce5dd6428b65604a398 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: get rid of unused xid in cifs_get_root Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 03d2f6b27cc4..7986d330cbb5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -532,7 +532,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *full_path = NULL; char *s, *p; char sep; - int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); @@ -541,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) cFYI(1, "Get root dentry for %s", full_path); - xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); p = s = full_path; @@ -572,7 +570,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dentry); dentry = child; } while (!IS_ERR(dentry)); - _FreeXid(xid); kfree(full_path); return dentry; } -- cgit v1.2.3-58-ga151 From f3a6a60e4c3ac83370c620dbbd08d2a418b9364d Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 12 Oct 2011 14:14:04 +0200 Subject: cifs: Fix typo 'CIFS_NFSD_EXPORT' It should be 'CONFIG_CIFS_NFSD_EXPORT'. No-one noticed because that symbol depends on BROKEN. 
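The failure mode generalizes: an #ifdef on a macro that nothing ever defines silently compiles the guarded block out, with no diagnostic. A minimal sketch with made-up names:

    #include <stdio.h>

    #define CONFIG_FOO 1   /* what the build system would define */

    int main(void)
    {
    #ifdef FOO             /* misspelled guard: block is silently dropped */
            printf("never printed\n");
    #endif
    #ifdef CONFIG_FOO      /* correct guard */
            printf("printed\n");
    #endif
            return 0;
    }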
Signed-off-by: Paul Bolle Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++-- fs/cifs/cifsfs.h | 4 ++-- fs/cifs/export.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7986d330cbb5..aa8fe7cee0b4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -132,12 +132,12 @@ cifs_read_super(struct super_block *sb) else sb->s_d_op = &cifs_dentry_ops; -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); sb->s_export_op = &cifs_export_ops; } -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ return 0; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 95da8027983d..d9dbaf869cd1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ #define CIFS_VERSION "1.75" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 55d87ac52000..9c7ecdccf2f3 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -45,7 +45,7 @@ #include "cifs_debug.h" #include "cifsfs.h" -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ @@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { .encode_fs = */ }; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ -- cgit v1.2.3-58-ga151 From c974befa402b5eb2ed115b3083b5a46a4be85a9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: untangle server->maxBuf and CIFSMaxBufSize server->maxBuf is the maximum SMB size (including header) that the server can handle. CIFSMaxBufSize is the maximum amount of data (sans header) that the client can handle. Currently maxBuf is being capped at CIFSMaxBufSize + the max headers size, and the two values are used somewhat interchangeably in the code. This makes little sense as these two values are not related at all. Separate them and make sure the code uses the right values in the right places. 
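A sketch of the distinction, with hypothetical numbers chosen only to show that the two limits need not agree:

    #include <stdio.h>

    static unsigned int server_maxBuf  = 16644;  /* server limit, header included */
    static unsigned int CIFSMaxBufSize = 16384;  /* client limit, data only */

    int main(void)
    {
            /* Request data sizing should come from the client's own data
             * limit, not be derived from the server's header-inclusive one. */
            printf("MaxDataCount = %u\n", CIFSMaxBufSize & 0xFFFFFF00);

            /* The server limit still matters, but for a different check:
             * the total frame sent must not exceed it. */
            printf("frame limit  = %u\n", server_maxBuf);
            return 0;
    }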
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 32 +++++++++----------------------- fs/cifs/connect.c | 11 +++++------ fs/cifs/file.c | 2 +- fs/cifs/sess.c | 4 +++- 4 files changed, 18 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b7df4bff3aaa..602326fa4a4f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -453,8 +453,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) } server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); server->maxReq = le16_to_cpu(rsp->MaxMpxCount); - server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), - (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ @@ -561,8 +560,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) little endian */ server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); /* probably no need to store and check maxvcs */ - server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), - (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); @@ -2812,8 +2810,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; @@ -3306,8 +3303,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count, pSMB->Reserved = 0; pSMB->TotalParameterCount = cpu_to_le32(parm_len); pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->DataCount = pSMB->TotalDataCount; temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + @@ -3977,8 +3973,7 @@ findFirstRetry: params = 12 + name_len /* includes null */ ; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(10); - pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4052,8 +4047,7 @@ findFirstRetry: psrch_inf->index_of_last_entry = 2 /* skip . and .. 
*/ + psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -4097,9 +4091,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, byte_count = 0; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(8); - pSMB->MaxDataCount = - cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & - 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4181,8 +4173,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -6035,12 +6026,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, pSMB->TotalParameterCount = 0 ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = 0; /* same in little endian or be */ -/* BB VERIFY verify which is correct for above BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 71beb0201970..a0077a5e0669 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -181,7 +181,7 @@ cifs_reconnect(struct TCP_Server_Info *server) -EINVAL = invalid transact2 */ -static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +static int check2ndT2(struct smb_hdr *pSMB) { struct smb_t2_rsp *pSMBt; int remaining; @@ -214,9 +214,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) cFYI(1, "missing %d bytes from transact2, check next response", remaining); - if (total_data_size > maxBufSize) { + if (total_data_size > CIFSMaxBufSize) { cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, maxBufSize); + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -486,7 +486,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->command != buf->Command) continue; - if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { + if (*length == 0 && check2ndT2(buf) > 0) { /* We have a multipart transact2 resp */ *is_multi_rsp = true; if (mid->resp_buf) { @@ -3130,8 +3130,7 @@ try_mount_again: cFYI(DBG2, "no very large read support, rsize now 127K"); } if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, - (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + cifs_sb->rsize = min(cifs_sb->rsize, CIFSMaxBufSize); cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9f41a10523a1..fd57165f55fa 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1868,7 +1868,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { current_read_size = min_t(const int, current_read_size, - pTcon->ses->server->maxBuf - 128); + CIFSMaxBufSize); } rc = -EAGAIN; while (rc == -EAGAIN) { diff 
--git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e619692ee0..c7d80e24f24e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) /* that we use in next few lines */ /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, + USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); pSMB->req.VcNumber = get_next_vcnum(ses); -- cgit v1.2.3-58-ga151 From 376b43f41c8b9315f7efdf085d214b6024337381 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: clean up checkSMB The variable names in this function are so ambiguous that it's very difficult to know what it's doing. Rename them to make it a bit more clear. Also, remove a redundant length check. cifsd checks to make sure that the rfclen isn't larger than the maximum frame size when it does the receive. Finally, change checkSMB to return a real error code (-EIO) when it finds an error. That will help simplify some coming changes in the callers. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/misc.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7c1693392598..4a1801b3195f 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) } int -checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) { - __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", + total_read, rfclen); - if (length < 2 + sizeof(struct smb_hdr)) { - if ((length >= sizeof(struct smb_hdr) - 1) + /* is this frame too small to even get to a BCC? 
*/ + if (total_read < 2 + sizeof(struct smb_hdr)) { + if ((total_read >= sizeof(struct smb_hdr) - 1) && (smb->Status.CifsError != 0)) { + /* it's an error return */ smb->WordCount = 0; /* some error cases do not return wct and bcc */ return 0; - } else if ((length == sizeof(struct smb_hdr) + 1) && + } else if ((total_read == sizeof(struct smb_hdr) + 1) && (smb->WordCount == 0)) { char *tmp = (char *)smb; /* Need to work around a bug in two servers here */ @@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) } else { cERROR(1, "Length less than smb header size"); } - return 1; - } - if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "smb length greater than MaxBufSize, mid=%d", - smb->Mid); - return 1; + return -EIO; } + /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb, mid)) - return 1; + return -EIO; clc_len = smbCalcSize(smb); - if (4 + len != length) { + if (4 + rfclen != total_read) { cERROR(1, "Length read does not match RFC1001 length %d", - len); - return 1; + rfclen); + return -EIO; } - if (4 + len != clc_len) { + if (4 + rfclen != clc_len) { /* check if bcc wrapped around for large read responses */ - if ((len > 64 * 1024) && (len > clc_len)) { + if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ - if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + len, smb->Mid); + clc_len, 4 + rfclen, smb->Mid); - if (4 + len < clc_len) { + if (4 + rfclen < clc_len) { cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - len, smb->Mid); - return 1; - } else if (len > clc_len + 512) { + rfclen, smb->Mid); + return -EIO; + } else if (rfclen > clc_len + 512) { /* * Some servers (Windows XP in particular) send more * data than the lengths in the SMB packet would @@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) * data to 512 bytes. */ cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", len, smb->Mid); - return 1; + "than SMB for mid=%u", rfclen, smb->Mid); + return -EIO; } } return 0; -- cgit v1.2.3-58-ga151 From 826a95e4a33f3e9bfa0d31ab769d5b01130f7111 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: consolidate signature generating code We have two versions of signature generating code. A vectorized and non-vectorized version. Eliminate a large chunk of cut-and-paste code by turning the non-vectorized version into a wrapper around the vectorized one. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 103 ++++++++++---------------------------------------- fs/cifs/cifsproto.h | 2 +- fs/cifs/transport.c | 11 ++++-- 3 files changed, 30 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 12f1c1263013..2cfb695d1f89 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,83 +37,8 @@ * the sequence number before this function is called. Also, this function * should be called with the server->srv_mutex held. 
*/ -static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, char *signature) -{ - int rc; - - if (cifs_pdu == NULL || signature == NULL || server == NULL) - return -EINVAL; - - if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature\n", __func__); - return -1; - } - - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); - if (rc) { - cERROR(1, "%s: Could not init md5\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - server->session_key.response, server->session_key.len); - if (rc) { - cERROR(1, "%s: Could not update with response\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); - if (rc) { - cERROR(1, "%s: Could not update with payload\n", __func__); - return rc; - } - - rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - if (rc) - cERROR(1, "%s: Could not generate md5 hash\n", __func__); - - return rc; -} - -/* must be called with server->srv_mutex held */ -int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, - __u32 *pexpected_response_sequence_number) -{ - int rc = 0; - char smb_signature[20]; - - if ((cifs_pdu == NULL) || (server == NULL)) - return -EINVAL; - - if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || - server->tcpStatus == CifsNeedNegotiate) - return rc; - - if (!server->session_estab) { - memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); - return rc; - } - - cifs_pdu->Signature.Sequence.SequenceNumber = - cpu_to_le32(server->sequence_number); - cifs_pdu->Signature.Sequence.Reserved = 0; - - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; - - rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); - if (rc) - memset(cifs_pdu->Signature.SecuritySignature, 0, 8); - else - memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); - - return rc; -} - -static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - struct TCP_Server_Info *server, char *signature) +static int cifs_calc_signature(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) { int i; int rc; @@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, { int rc = 0; char smb_signature[20]; - struct smb_hdr *cifs_pdu = iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; } -int cifs_verify_signature(struct smb_hdr *cifs_pdu, +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + struct kvec iov; + + iov.iov_base = cifs_pdu; + iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + + return cifs_sign_smb2(&iov, 1, server, + pexpected_response_sequence_number); +} + +int cifs_verify_signature(struct 
kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cifs_pdu->Signature.Sequence.Reserved = 0; mutex_lock(&server->srv_mutex); - rc = cifs_calculate_signature(cifs_pdu, server, - what_we_think_sig_should_be); + rc = cifs_calc_signature(iov, nr_iov, server, + what_we_think_sig_should_be); mutex_unlock(&server->srv_mutex); if (rc) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8df28e925e5b..03dc945d94a3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -380,7 +380,7 @@ extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); -extern int cifs_verify_signature(struct smb_hdr *, +extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 10ca6b2c26b7..33a3fbf3a3a5 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -496,13 +496,18 @@ int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - dump_smb(mid->resp_buf, - min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + struct kvec iov; + + iov.iov_base = mid->resp_buf; + iov.iov_len = len; /* FIXME: add code to kill session */ - if (cifs_verify_signature(mid->resp_buf, server, + if (cifs_verify_signature(&iov, 1, server, mid->sequence_number + 1) != 0) cERROR(1, "Unexpected SMB signature"); } -- cgit v1.2.3-58-ga151 From e2218eab2050e879b253ca112aabd5f7167572af Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: trivial: remove obsolete comment Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a0077a5e0669..6126fbeaecb6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -694,12 +694,6 @@ incomplete_rcv: * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ - - /* - * Note that RFC 1001 length is big endian on the wire, - * but we convert it here so it is always manipulated - * as host byte order. - */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); cFYI(1, "rfc1002 length 0x%x", pdu_length+4); -- cgit v1.2.3-58-ga151 From e831e6cf3acb058d898411367a582deef80e32f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: make smb_msg local to read_from_socket If msg_controllen is 0, then the socket layer should never touch these fields. Thus, there's no need to continually reset them. Also, there's no need to keep this field on the stack for the demultiplex thread, just make it a local variable in read_from_socket. 
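The same pattern holds for the userspace analogue of this interface: when no ancillary data is wanted, zeroing the control fields once before the loop suffices, because a receive with msg_controllen == 0 leaves them untouched. A minimal sketch, error handling trimmed:

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    /* Read exactly to_read bytes from a connected socket. */
    static ssize_t read_exact(int fd, char *buf, size_t to_read)
    {
            struct msghdr msg;
            struct iovec iov;
            size_t total = 0;
            ssize_t n;

            memset(&msg, 0, sizeof(msg));  /* msg_control = NULL, msg_controllen = 0 */

            while (total < to_read) {
                    iov.iov_base   = buf + total;
                    iov.iov_len    = to_read - total;
                    msg.msg_iov    = &iov;
                    msg.msg_iovlen = 1;

                    n = recvmsg(fd, &msg, 0);
                    if (n <= 0)
                            return n;  /* error or EOF */
                    total += n;
            }
            return total;
    }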
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6126fbeaecb6..ed969fd5f7cc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -359,16 +359,20 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, } static int -read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, +read_from_socket(struct TCP_Server_Info *server, struct kvec *iov, unsigned int to_read, unsigned int *ptotal_read, bool is_header_read) { int length, rc = 0; unsigned int total_read; + struct msghdr smb_msg; char *buf = iov->iov_base; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + for (total_read = 0; total_read < to_read; total_read += length) { - length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, + length = kernel_recvmsg(server->ssocket, &smb_msg, iov, 1, to_read - total_read, 0); if (server->tcpStatus == CifsExiting) { /* then will exit */ @@ -397,8 +401,6 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, iov->iov_base = (to_read - total_read) + buf; iov->iov_len = to_read - total_read; - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; rc = 3; } else rc = 1; @@ -634,7 +636,6 @@ cifs_demultiplex_thread(void *p) unsigned int pdu_length, total_read; char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; struct smb_hdr *smb_buffer = NULL; - struct msghdr smb_msg; struct kvec iov; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; @@ -665,8 +666,6 @@ cifs_demultiplex_thread(void *p) buf = smallbuf; iov.iov_base = buf; iov.iov_len = 4; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; pdu_length = 4; /* enough to get RFC1001 header */ incomplete_rcv: @@ -681,7 +680,7 @@ incomplete_rcv: continue; } - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, + rc = read_from_socket(server, &iov, pdu_length, &total_read, true /* header read */); if (rc == 3) goto incomplete_rcv; @@ -710,7 +709,7 @@ incomplete_rcv: iov.iov_base = 4 + buf; iov.iov_len = pdu_length; - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, + rc = read_from_socket(server, &iov, pdu_length, &total_read, false); if (rc == 2) break; -- cgit v1.2.3-58-ga151 From ba749e6d5227de22e442c6088cc7dc1f0c5c68bf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: check for unresponsive server every time we call kernel_recvmsg If the server stops sending data while in the middle of sending a response, then we still want to reconnect it if it doesn't come back. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 33 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ed969fd5f7cc..abbc6c3fe3f1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -358,6 +358,23 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, return true; } +static bool +server_unresponsive(struct TCP_Server_Info *server) +{ + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " "Reconnecting...", server->hostname, (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + wake_up(&server->response_q); + return true; + } + + return false; +} + static int read_from_socket(struct TCP_Server_Info *server, struct kvec *iov, unsigned int to_read, @@ -372,6 +389,11 @@ read_from_socket(struct TCP_Server_Info *server, smb_msg.msg_controllen = 0; for (total_read = 0; total_read < to_read; total_read += length) { + if (server_unresponsive(server)) { + rc = 1; + break; + } + length = kernel_recvmsg(server->ssocket, &smb_msg, iov, 1, to_read - total_read, 0); if (server->tcpStatus == CifsExiting) { /* then will exit */ @@ -669,17 +691,6 @@ cifs_demultiplex_thread(void *p) pdu_length = 4; /* enough to get RFC1001 header */ incomplete_rcv: - if (echo_retries > 0 && server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + - (echo_retries * SMB_ECHO_INTERVAL))) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (echo_retries * SMB_ECHO_INTERVAL / HZ)); - cifs_reconnect(server); - wake_up(&server->response_q); - continue; - } - rc = read_from_socket(server, &iov, pdu_length, &total_read, true /* header read */); if (rc == 3) -- cgit v1.2.3-58-ga151 From e75047344ea415760b2508a6fa29c0288c7b6b68 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 12 Oct 2011 17:47:03 -0500 Subject: add new module parameter 'enable_oplocks' Thus spake Jeff Layton: "Making that a module parm would allow you to set that parameter at boot time without needing to add special startup scripts. IMO, all of the procfile "switches" under /proc/fs/cifs should be module parms instead." This patch doesn't alter the default behavior (Oplocks are enabled by default). To disable oplocks when loading the module, use modprobe cifs enable_oplocks=0 (any of '0' or 'n' or 'N' conventions can be used). To disable oplocks at runtime using the new interface, use echo 0 > /sys/module/cifs/parameters/enable_oplocks The older /proc/fs/cifs/OplockEnabled interface will be deprecated after two releases. A subsequent patch will add a warning message about this deprecation. Changes since v2: - make enable_oplocks a 'bool' Changes since v1: - eliminate the use of an extra variable by renaming the old one to enable_oplocks and making it an 'int' type.
Reported-by: Alexander Swen Reviewed-by: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 6 +++--- fs/cifs/cifsfs.c | 6 +++++- fs/cifs/cifsglob.h | 3 ++- fs/cifs/dir.c | 2 +- fs/cifs/file.c | 4 ++-- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6d40656e1e29..e1715313f398 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = { static int cifs_oplock_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%d\n", oplockEnabled); + seq_printf(m, "%d\n", enable_oplocks); return 0; } @@ -530,9 +530,9 @@ static ssize_t cifs_oplock_proc_write(struct file *file, if (rc) return rc; if (c == '0' || c == 'n' || c == 'N') - oplockEnabled = 0; + enable_oplocks = false; else if (c == '1' || c == 'y' || c == 'Y') - oplockEnabled = 1; + enable_oplocks = true; return count; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index aa8fe7cee0b4..5c2972106816 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -53,7 +53,7 @@ int cifsFYI = 0; int cifsERROR = 1; int traceSMB = 0; -unsigned int oplockEnabled = 1; +bool enable_oplocks = true; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644); MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " "reconnecting server. Default: 5. 0 means " "never reconnect."); +module_param(enable_oplocks, bool, 0644); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" + "y/Y/1"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 95dad9d14cf1..d734dee9d495 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -964,7 +964,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions to be established on existing mount if we have the uid/password or Kerberos credential or equivalent for current user */ -GLOBAL_EXTERN unsigned int oplockEnabled; +/* enable or disable oplocks */ +GLOBAL_EXTERN bool enable_oplocks; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 72d448bf96ce..4dd5333753fa 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } tcon = tlink_tcon(tlink); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; if (nd) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fd57165f55fa..8e184150cfb5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -371,7 +371,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -495,7 +495,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, pCifsFile->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; -- cgit v1.2.3-58-ga151 From c9c4708fdf63d26e3fdfc65bba9e82e654cae285 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 12 Oct 2011 11:52:01 +0530 Subject: 
cifs: update README about the kernel module parameters Reported-by: Alexander Swen Cc: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/README | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/cifs/README b/fs/cifs/README index c5c2c5e5f0f2..95b37a1d58f2 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -745,4 +745,18 @@ installed and something like the following lines should be added to the create cifs.spnego * * /usr/local/sbin/cifs.upcall %k create dns_resolver * * /usr/local/sbin/cifs.upcall %k +CIFS kernel module parameters +============================= +These module parameters can be specified or modified either during the time of +module loading or during the runtime by using the interface + /proc/module/cifs/parameters/ + +i.e. echo "value" > /proc/module/cifs/parameters/ + +1. echo_retries - The number of echo attempts before giving up and + reconnecting to the server. The default is 5. The value 0 + means never reconnect. + +2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. + [Y/y/1]. To disable use any of [N/n/0]. -- cgit v1.2.3-58-ga151 From 8bc4392a1e50f346e97f8777aaefd9cfc3d45c9f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 12 Oct 2011 11:51:53 +0530 Subject: cifs: warn about deprecation of /proc/fs/cifs/OplockEnabled interface The plan is to deprecate this interface by kernel version 3.4. Changes since v1 - add a '\n' to the printk. Reported-by: Alexander Swen Cc: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e1715313f398..84e8c0724704 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -526,6 +526,9 @@ static ssize_t cifs_oplock_proc_write(struct file *file, char c; int rc; + printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " + "will be removed in kernel version 3.4. Please migrate to " + "using the 'enable_oplocks' module parameter in cifs.ko.\n"); rc = get_user(c, buffer); if (rc) return rc; -- cgit v1.2.3-58-ga151 From 3d3ea8e64efbeb3e4289675dbbfab82333395642 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Mon, 26 Sep 2011 09:56:44 -0500 Subject: cifs: Add mount options for backup intent (try #6) Add mount options backupuid and backupgid. They allow an authenticated user, who may not have access permission but who has the "Backup files and directories" user right (by virtue of being part of the built-in group Backup Operators), to access files with the intent to back them up, including their ACLs. When the mount option backupuid is specified, the cifs client restricts the use of backup intent to the user whose effective user id matches the one specified with the mount option. When the mount option backupgid is specified, the cifs client restricts the use of backup intent to users who belong to the group id specified with the mount option. If an authenticated user is not part of the built-in group Backup Operators at the server, access to such files is denied, even if allowed by the client.
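A hypothetical usage example (server, share, and ids are made up):

    # let uid 1001, or any member of gid 501, open files with backup intent
    mount -t cifs //server/share /mnt/backup -o username=backupop,backupuid=1001,backupgid=501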
Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_fs_sb.h | 4 ++++ fs/cifs/cifsacl.c | 18 ++++++++++++------ fs/cifs/cifsglob.h | 7 ++++++- fs/cifs/cifsproto.h | 1 + fs/cifs/connect.c | 27 +++++++++++++++++++++++++++ fs/cifs/dir.c | 10 ++++++++-- fs/cifs/file.c | 12 ++++++++++-- fs/cifs/link.c | 17 ++++++++++++----- fs/cifs/misc.c | 15 +++++++++++++++ 9 files changed, 95 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7260e11e21f8..500d65859279 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -43,6 +43,8 @@ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ +#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -55,6 +57,8 @@ struct cifs_sb_info { atomic_t active; uid_t mnt_uid; gid_t mnt_gid; + uid_t mnt_backupuid; + gid_t mnt_backupgid; mode_t mnt_file_mode; mode_t mnt_dir_mode; unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d0f59faefb78..b244e07c3048 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -945,7 +945,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, { struct cifs_ntsd *pntsd = NULL; int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -956,9 +956,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); CIFSSMBClose(xid, tcon, fid); @@ -995,7 +998,7 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, struct cifs_ntsd *pnntsd, u32 acllen) { int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1006,7 +1009,10 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, create_options, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d734dee9d495..9551437a2498 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -167,6 +167,8 @@ struct smb_vol { uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; + uid_t backupuid; + gid_t backupgid; mode_t file_mode; mode_t dir_mode; unsigned secFlg; @@ -179,6 +181,8 @@ struct smb_vol { bool noperm:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool 
cifs_acl:1; + bool backupuid_specified; /* mount option backupuid is specified */ + bool backupgid_specified; /* mount option backupgid is specified */ bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; @@ -219,7 +223,8 @@ struct smb_vol { CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ - CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ + CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ MS_NODEV | MS_SYNCHRONOUS) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 03dc945d94a3..9ddb1eccde69 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -90,6 +90,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); +extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index abbc6c3fe3f1..70dd2c418276 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -831,6 +831,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, { char *value, *data, *end; char *mountdata_copy = NULL, *options; + int err; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -887,6 +888,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, cFYI(1, "Null separator not allowed"); } } + vol->backupuid_specified = false; /* no backup intent for a user */ + vol->backupgid_specified = false; /* no backup intent for a group */ while ((data = strsep(&options, separator)) != NULL) { if (!*data) @@ -1446,6 +1449,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->mfsymlinks = true; } else if (strnicmp(data, "multiuser", 8) == 0) { vol->multiuser = true; + } else if (!strnicmp(data, "backupuid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupuid); + if (err < 0) { + cERROR(1, "%s: Invalid backupuid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupuid_specified = true; + } else if (!strnicmp(data, "backupgid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupgid); + if (err < 0) { + cERROR(1, "%s: Invalid backupgid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupgid_specified = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -2737,6 +2756,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_uid = pvolume_info->linux_uid; cifs_sb->mnt_gid = pvolume_info->linux_gid; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; cFYI(1, "file mode: 0x%x dir mode: 0x%x", @@ -2767,6 +2790,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if 
(pvolume_info->backupuid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; if (pvolume_info->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (pvolume_info->override_gid) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4dd5333753fa..0c8098d54d2b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, @@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, { int rc = -EPERM; int xid; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *pTcon; @@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + GENERIC_WRITE, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8e184150cfb5..237192ae7587 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -174,6 +174,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, int rc; int desiredAccess; int disposition; + int create_options = CREATE_NOT_DIR; FILE_ALL_INFO *buf; desiredAccess = cifs_convert_flags(f_flags); @@ -210,9 +211,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (!buf) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + desiredAccess, create_options, pnetfid, poplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else @@ -465,6 +469,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) char *full_path = NULL; int desiredAccess; int disposition = FILE_OPEN; + int create_options = CREATE_NOT_DIR; __u16 netfid; xid = GetXid(); @@ -524,6 +529,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + /* Can not refresh inode by passing in file_info buf to be returned by SMBOpen and then calling get_inode_info with returned buf since file might have write behind data that needs to be flushed @@ -531,7 +539,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) that inode was not dirty locally we could do this */ rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index db3f18cdf024..8693b5d0e180 100644 
--- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) static int CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; + int remap; + int create_options = CREATE_NOT_DIR; __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; struct cifs_io_parms io_parms; + struct nls_table *nls_codepage; + + nls_codepage = cifs_sb->local_nls; + remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) @@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, return rc; } + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, nls_codepage, remap); if (rc != 0) { kfree(buf); @@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4a1801b3195f..703ef5c6fdb1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -675,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->clientCanCacheRead = false; } } + +bool +backup_cred(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + if (cifs_sb->mnt_backupuid == current_fsuid()) + return true; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (in_group_p(cifs_sb->mnt_backupgid)) + return true; + } + + return false; +} -- cgit v1.2.3-58-ga151 From d02616869221517582dea7972c9b5063ac9a2ae7 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Wed, 21 Sep 2011 14:33:31 -0500 Subject: cifs: clean up unused encryption code Remove unused #if 0 encryption code.
Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smbencrypt.c | 121 --------------------------------------------------- 1 file changed, 121 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 42b9fff48751..ac1221d969d6 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -265,91 +265,6 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16) return rc; } -#if 0 /* currently unused */ -/* Does both the NT and LM owfs of a user's password */ -static void -nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) -{ - char passwd[514]; - - memset(passwd, '\0', 514); - if (strlen(pwd) < 513) - strcpy(passwd, pwd); - else - memcpy(passwd, pwd, 512); - /* Calculate the MD4 hash (NT compatible) of the password */ - memset(nt_p16, '\0', 16); - E_md4hash(passwd, nt_p16); - - /* Mangle the passwords into Lanman format */ - passwd[14] = '\0'; -/* strupper(passwd); */ - - /* Calculate the SMB (lanman) hash functions of the password */ - - memset(p16, '\0', 16); - E_P16((unsigned char *) passwd, (unsigned char *) p16); - - /* clear out local copy of user's password (just being paranoid). */ - memset(passwd, '\0', sizeof(passwd)); -} -#endif - -/* Does the NTLMv2 owfs of a user's password */ -#if 0 /* function not needed yet - but will be soon */ -static void -ntv2_owf_gen(const unsigned char owf[16], const char *user_n, - const char *domain_n, unsigned char kr_buf[16], - const struct nls_table *nls_codepage) -{ - wchar_t *user_u; - wchar_t *dom_u; - int user_l, domain_l; - struct HMACMD5Context ctx; - - /* might as well do one alloc to hold both (user_u and dom_u) */ - user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); - if (user_u == NULL) - return; - dom_u = user_u + 1024; - - /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); - push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ - - /* BB user and domain may need to be uppercased */ - user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); - domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); - - user_l++; /* trailing null */ - domain_l++; - - hmac_md5_init_limK_to_64(owf, 16, &ctx); - hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); - hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); - hmac_md5_final(kr_buf, &ctx); - - kfree(user_u); -} -#endif - -/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ -#if 0 /* currently unused */ -static void -NTLMSSPOWFencrypt(unsigned char passwd[8], - unsigned char *ntlmchalresp, unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - memcpy(p21, passwd, 8); - memset(p21 + 8, 0xbd, 8); - - E_P24(p21, ntlmchalresp, p24); -} -#endif - /* Does the NT MD4 hash then des encryption. */ int SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) @@ -369,39 +284,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) rc = E_P24(p21, c8, p24); return rc; } - - -/* Does the md5 encryption from the NT hash for NTLMv2. 
*/ -/* These routines will be needed later */ -#if 0 -static void -SMBOWFencrypt_ntv2(const unsigned char kr[16], - const struct data_blob *srv_chal, - const struct data_blob *cli_chal, unsigned char resp_buf[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); - hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); - hmac_md5_final(resp_buf, &ctx); -} - -static void -SMBsesskeygen_ntv2(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(nt_resp, 16, &ctx); - hmac_md5_final((unsigned char *) sess_key, &ctx); -} - -static void -SMBsesskeygen_ntv1(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); -} -#endif -- cgit v1.2.3-58-ga151 From 20c3a200c418ea1b02037800830ba8a7cdd1b275 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 12 Oct 2011 20:17:55 -0500 Subject: Typo in cifs readme in name of module parm directory Suresh had a typo in his recent patch adding information on the new oplock_enabled parm. Should be documented as in directory /sys/module/cifs/parameters not /proc/module/cifs/parameters Signed-off-by: Steve French --- fs/cifs/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/README b/fs/cifs/README index 95b37a1d58f2..895da1dc1550 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -751,7 +751,7 @@ These module parameters can be specified or modified either during the time of module loading or during the runtime by using the interface /proc/module/cifs/parameters/ -i.e. echo "value" > /proc/module/cifs/parameters/ +i.e. echo "value" > /sys/module/cifs/parameters/ 1. echo_retries - The number of echo attempts before giving up and reconnecting to the server. The default is 5. The value 0 -- cgit v1.2.3-58-ga151 From 21fed0d5b763b94a7d1568c27d0cce892ab8d43e Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Tue, 9 Aug 2011 14:30:48 -0500 Subject: cifs: Add data structures and functions for uid/gid to SID mapping (try #4) Add data structures and functions necessary to map a uid and gid to SID. These functions are very similar to the ones used to map a SID to uid and gid. This time, instead of storing sid to id mapping sorted on a sid value, id to sid is stored, sorted on an id. A cifs upcall sends an id (uid or gid) and expects a SID structure in return, if mapping was done successfully. A failed id to sid mapping returns EINVAL. This patchset aims to enable chown and chgrp commands when cifsacl mount option is specified, especially to Windows SMB servers. Currently we can't do that. So now along with the chmod command, chown and chgrp work. Winbind is used to map an id to a SID. chown and chgrp use an upcall to provide an id to winbind, and the upcall returns with the corresponding SID if any exists. That SID is used to build the security descriptor. The DACL part of a security descriptor is not changed by either chown or chgrp functionality. The cifs client maintains separate caches for uid to SID and gid to SID mapping. This is similar to the one used earlier to map SID to id (as part of the ID mapping code). I tested it by mounting shares from a Windows (2003) server by authenticating as two users, one at a time, as Administrator and as an ordinary user. And then attempting to change owner of a file on the share.
Depending on the permissions/privileges at the server for that file, the chown request fails either to open the file (to change the ownership) or to set the security descriptor. So it all depends on the privileges on the file at the server and what user you are authenticated as at the server; the cifs client is just a conduit. I compared the security descriptor sent during the chown command with what smbcacls sends when it is used with the -M OWNER: option, and they are similar. Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/cifsglob.h | 6 ++ 2 files changed, 204 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index b244e07c3048..992f1fb299d1 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); spin_unlock(&sidgidlock); + root = &siduidtree; + spin_lock(&uidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&gidsidlock); + return nr_rem; } +static void +sid_rb_insert(struct rb_root *root, unsigned long cid, + struct cifs_sid_id **psidid, char *typestr) +{ + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + if (cid > lsidid->id) { + linkto = &(node->rb_left); + node = node->rb_left; + } + if (cid < lsidid->id) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + (*psidid)->id = cid; + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sprintf(strptr, "%ld", cid); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +sid_rb_search(struct rb_root *root, unsigned long cid) +{ + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + if (cid > lsidid->id) + node = node->rb_left; + else if (cid < lsidid->id) + node = node->rb_right; + else /* node found */ + return lsidid; + } + + return NULL; +} + static struct shrinker cifs_shrinker = { .shrink = cifs_idmap_shrinker, .seeks = DEFAULT_SEEKS, @@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key,
const void *data, size_t datalen) memcpy(payload, data, datalen); key->payload.data = payload; + key->datalen = datalen; return 0; } @@ -223,6 +291,120 @@ sidid_pending_wait(void *unused) return signal_pending(current) ? -ERESTARTSYS : 0; } +static int +id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +{ + int rc = 0; + struct key *sidkey; + const struct cred *saved_cred; + struct cifs_sid *lsid; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -EINVAL; + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + sid_rb_insert(cidtree, cid, &psidid, + sidtype == SIDOWNER ? "oi:" : "gi:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + psidid->time = jiffies; /* update ts for accessing */ + goto id_sid_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { + rc = -EINVAL; + goto id_sid_out; + } + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map and id to a SID", __func__); + } else { + lsid = (struct cifs_sid *)sidkey->payload.data; + memcpy(&psidid->sid, lsid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + memcpy(ssid, &psidid->sid, + sidkey->datalen < sizeof(struct cifs_sid) ? 
+ sidkey->datalen : sizeof(struct cifs_sid)); + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(sidkey); + kfree(psidid->sidstr); + } + psidid->time = jiffies; /* update ts for accessing */ + revert_creds(saved_cred); + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + else + rc = -EINVAL; + } +id_sid_out: + --psidid->refcount; + return rc; +} + static int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) @@ -383,6 +565,10 @@ init_cifs_idmap(void) spin_lock_init(&sidgidlock); gidtree = RB_ROOT; + spin_lock_init(&uidsidlock); + siduidtree = RB_ROOT; + spin_lock_init(&gidsidlock); + sidgidtree = RB_ROOT; register_shrinker(&cifs_shrinker); cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); @@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void) while ((node = rb_first(root))) rb_erase(node, root); spin_unlock(&sidgidlock); + + root = &siduidtree; + spin_lock(&uidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&gidsidlock); } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9551437a2498..3b83fe7bfe60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -984,10 +984,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ /* reconnect after this many failed echo attempts */ GLOBAL_EXTERN unsigned short echo_retries; +#ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; GLOBAL_EXTERN spinlock_t siduidlock; GLOBAL_EXTERN spinlock_t sidgidlock; +GLOBAL_EXTERN struct rb_root siduidtree; +GLOBAL_EXTERN struct rb_root sidgidtree; +GLOBAL_EXTERN spinlock_t uidsidlock; +GLOBAL_EXTERN spinlock_t gidsidlock; +#endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); -- cgit v1.2.3-58-ga151 From a52c1eb7ae79f0a11511d49cfc00d2f9e576abea Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: simplify read_from_socket Move the iovec handling entirely into read_from_socket. That simplifies the code and gets rid of the special handling for header reads. With this we can also get rid of the "goto incomplete_rcv" label in the main demultiplex thread function since we can now treat header and non-header receives the same way. Also, make it return an int (since we'll never receive enough to worry about the sign bit anyway), and simply make it return the amount of bytes read or a negative error code. 
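For illustration, here is a minimal userspace sketch of the convention the rewrite adopts: loop until the requested byte count has arrived, then return that count, or a negative errno-style value on failure. This is plain POSIX-socket code, not cifs code, and read_exact is a hypothetical name:

	#include <errno.h>
	#include <sys/types.h>
	#include <sys/socket.h>

	/* Return 'to_read' on success or a negative error code, mirroring
	 * the simplified read_from_socket() contract described above. */
	static int read_exact(int fd, char *buf, unsigned int to_read)
	{
		unsigned int total = 0;

		while (total < to_read) {
			ssize_t n = recv(fd, buf + total, to_read - total, 0);

			if (n == -1 && (errno == EINTR || errno == EAGAIN))
				continue;	/* transient condition; retry */
			if (n <= 0)
				return -EPIPE;	/* peer closed or hard error */
			total += n;
		}
		return total;
	}

With this shape a caller needs only a single sign test on the return value, which is what lets the demultiplex thread below treat header and body reads identically.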
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 77 +++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 70dd2c418276..ff47c1842bec 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -376,35 +376,33 @@ server_unresponsive(struct TCP_Server_Info *server) } static int -read_from_socket(struct TCP_Server_Info *server, - struct kvec *iov, unsigned int to_read, - unsigned int *ptotal_read, bool is_header_read) +read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) { - int length, rc = 0; - unsigned int total_read; + int length = 0; + int total_read; struct msghdr smb_msg; - char *buf = iov->iov_base; + struct kvec iov; smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; - for (total_read = 0; total_read < to_read; total_read += length) { + for (total_read = 0; to_read; total_read += length, to_read -= length) { if (server_unresponsive(server)) { - rc = 1; + total_read = -EAGAIN; break; } - length = kernel_recvmsg(server->ssocket, &smb_msg, iov, 1, - to_read - total_read, 0); + iov.iov_base = buf + total_read; + iov.iov_len = to_read; + length = kernel_recvmsg(server->ssocket, &smb_msg, &iov, 1, + to_read, 0); if (server->tcpStatus == CifsExiting) { - /* then will exit */ - rc = 2; + total_read = -ESHUTDOWN; break; } else if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - /* Reconnect wakes up rspns q */ - /* Now we will reread sock */ - rc = 1; + total_read = -EAGAIN; break; } else if (length == -ERESTARTSYS || length == -EAGAIN || @@ -416,28 +414,16 @@ read_from_socket(struct TCP_Server_Info *server, */ usleep_range(1000, 2000); length = 0; - if (!is_header_read) - continue; - /* Special handling for header read */ - if (total_read) { - iov->iov_base = (to_read - total_read) + - buf; - iov->iov_len = to_read - total_read; - rc = 3; - } else - rc = 1; - break; + continue; } else if (length <= 0) { - cERROR(1, "Received no data, expecting %d", - to_read - total_read); + cFYI(1, "Received no data or error: expecting %d " + "got %d", to_read, length); cifs_reconnect(server); - rc = 1; + total_read = -EAGAIN; break; } } - - *ptotal_read = total_read; - return rc; + return total_read; } static bool @@ -658,12 +644,10 @@ cifs_demultiplex_thread(void *p) unsigned int pdu_length, total_read; char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; struct smb_hdr *smb_buffer = NULL; - struct kvec iov; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; bool isLargeBuf = false; bool isMultiRsp = false; - int rc; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -686,19 +670,12 @@ cifs_demultiplex_thread(void *p) isMultiRsp = false; smb_buffer = (struct smb_hdr *)smallbuf; buf = smallbuf; - iov.iov_base = buf; - iov.iov_len = 4; pdu_length = 4; /* enough to get RFC1001 header */ -incomplete_rcv: - rc = read_from_socket(server, &iov, pdu_length, - &total_read, true /* header read */); - if (rc == 3) - goto incomplete_rcv; - else if (rc == 2) - break; - else if (rc == 1) + length = read_from_socket(server, buf, pdu_length); + if (length < 0) continue; + total_read = length; /* * The right amount was read from socket - 4 bytes, @@ -718,16 +695,10 @@ incomplete_rcv: buf = bigbuf; } - iov.iov_base = 4 + buf; - iov.iov_len = pdu_length; - rc = read_from_socket(server, &iov, pdu_length, - &total_read, false); - if (rc == 2) - break; - else if 
(rc == 1) + length = read_from_socket(server, buf + 4, pdu_length); + if (length < 0) continue; - - total_read += 4; /* account for rfc1002 hdr */ + total_read += length; dump_smb(smb_buffer, total_read); -- cgit v1.2.3-58-ga151 From 94443f43404239c2a6dc4252a7cb9e77f5b1eb6e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 7 Oct 2011 18:57:45 +0400 Subject: CIFS: Fix incorrect max RFC1002 write size value ..the length field has only 17 bits. Cc: Acked-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ff47c1842bec..82bc0d27e495 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2809,10 +2809,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, /* * When the server doesn't allow large posix writes, only allow a wsize of - * 128k minus the size of the WRITE_AND_X header. That allows for a write up + * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up * to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 -- cgit v1.2.3-58-ga151 From 03776f4516bc299b3145595bdd704d40d69adc02 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 17 Aug 2010 11:26:00 +0400 Subject: CIFS: Simplify byte range locking code Split cifs_lock into several functions and let CIFSSMBLock get pid as an argument. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/cifsproto.h | 2 +- fs/cifs/cifssmb.c | 4 +- fs/cifs/file.c | 370 ++++++++++++++++++++++++++++------------------------ 4 files changed, 205 insertions(+), 172 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 3b83fe7bfe60..1f265ffe7e63 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -492,6 +492,7 @@ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ __u64 offset; __u64 length; + __u32 pid; __u8 type; }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 9ddb1eccde69..94834dbb46da 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,7 +361,7 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, int remap_special_chars); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 netfid, const __u64 len, + const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 602326fa4a4f..e33093f7ef0d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1963,7 +1963,7 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const __u64 len, + const __u16 smb_file_id, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level) @@ -1999,7 +1999,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, pSMB->Fid = smb_file_id; /* netfid stays le */ if ((numLock != 0) || (numUnlock != 0)) { - pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + pSMB->Locks[0].Pid = cpu_to_le16(netpid); /* 
BB where to store pid high? */ pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 237192ae7587..ab85699c5653 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -639,8 +639,8 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *fid, __u64 len, - __u64 offset, __u8 lockType) +static int store_file_lock(struct cifsFileInfo *cfile, __u64 len, + __u64 offset, __u8 type, __u16 netfid) { struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); @@ -648,210 +648,241 @@ static int store_file_lock(struct cifsFileInfo *fid, __u64 len, return -ENOMEM; li->offset = offset; li->length = len; - li->type = lockType; - mutex_lock(&fid->lock_mutex); - list_add(&li->llist, &fid->llist); - mutex_unlock(&fid->lock_mutex); + li->type = type; + li->pid = current->tgid; + mutex_lock(&cfile->lock_mutex); + list_add_tail(&li->llist, &cfile->llist); + mutex_unlock(&cfile->lock_mutex); return 0; } -int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +static void +cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, + bool *wait_flag) { - int rc, xid; - __u32 numLock = 0; - __u32 numUnlock = 0; - __u64 length; - bool wait_flag = false; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - __u16 netfid; - __u8 lockType = LOCKING_ANDX_LARGE_FILES; - bool posix_locking = 0; - - length = 1 + pfLock->fl_end - pfLock->fl_start; - rc = -EACCES; - xid = GetXid(); - - cFYI(1, "Lock parm: 0x%x flockflags: " - "0x%x flocktype: 0x%x start: %lld end: %lld", - cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, - pfLock->fl_end); - - if (pfLock->fl_flags & FL_POSIX) + if (flock->fl_flags & FL_POSIX) cFYI(1, "Posix"); - if (pfLock->fl_flags & FL_FLOCK) + if (flock->fl_flags & FL_FLOCK) cFYI(1, "Flock"); - if (pfLock->fl_flags & FL_SLEEP) { + if (flock->fl_flags & FL_SLEEP) { cFYI(1, "Blocking lock"); - wait_flag = true; + *wait_flag = true; } - if (pfLock->fl_flags & FL_ACCESS) + if (flock->fl_flags & FL_ACCESS) cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); - if (pfLock->fl_flags & FL_LEASE) + "not implemented yet"); + if (flock->fl_flags & FL_LEASE) cFYI(1, "Lease on file - not implemented yet"); - if (pfLock->fl_flags & + if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) - cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); - if (pfLock->fl_type == F_WRLCK) { + *type = LOCKING_ANDX_LARGE_FILES; + if (flock->fl_type == F_WRLCK) { cFYI(1, "F_WRLCK "); - numLock = 1; - } else if (pfLock->fl_type == F_UNLCK) { + *lock = 1; + } else if (flock->fl_type == F_UNLCK) { cFYI(1, "F_UNLCK"); - numUnlock = 1; - /* Check if unlock includes more than - one lock range */ - } else if (pfLock->fl_type == F_RDLCK) { + *unlock = 1; + /* Check if unlock includes more than one lock range */ + } else if (flock->fl_type == F_RDLCK) { cFYI(1, "F_RDLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; - } else if (pfLock->fl_type == F_EXLCK) { + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; + } else if (flock->fl_type == F_EXLCK) { cFYI(1, "F_EXLCK"); - numLock = 1; - } else if (pfLock->fl_type == F_SHLCK) { + *lock = 1; + } else if (flock->fl_type == F_SHLCK) { cFYI(1, "F_SHLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; 
} else cFYI(1, "Unknown type of lock"); +} - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - netfid = ((struct cifsFileInfo *)file->private_data)->netfid; - - if ((tcon->ses->capabilities & CAP_UNIX) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - posix_locking = 1; - /* BB add code here to normalize offset and length to - account for negative length which we can not accept over the - wire */ - if (IS_GETLK(cmd)) { - if (posix_locking) { - int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) - posix_lock_type = CIFS_RDLCK; - else - posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, pfLock, posix_lock_type, - wait_flag); - FreeXid(xid); - return rc; - } - - /* BB we could chain these into one lock request BB */ - rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 1 /* numUnlock */ , - 0 /* numLock */ , lockType, - 0 /* wait flag */, 0); - pfLock->fl_type = F_UNLCK; - if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); - rc = 0; - - } else { - /* if rc == ERR_SHARING_VIOLATION ? */ - rc = 0; +static int +cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + __u16 netfid = cfile->netfid; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - if (lockType & LOCKING_ANDX_SHARED_LOCK) { - pfLock->fl_type = F_WRLCK; - } else { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, 1, - lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, - length, pfLock->fl_start, 1, 0, - lockType | - LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - pfLock->fl_type = F_RDLCK; - if (rc != 0) - cERROR(1, "Error unlocking " - "previously locked range %d " - "during test of lock", rc); - rc = 0; - } else { - pfLock->fl_type = F_WRLCK; - rc = 0; - } - } - } + if (posix_lck) { + int posix_lock_type; + if (type & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, + length, flock, posix_lock_type, + wait_flag); + return rc; + } - FreeXid(xid); + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type, 0, 0); + flock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + rc = 0; return rc; } - if (!numLock && !numUnlock) { - /* if no lock or unlock then nothing - to do since we do not know what it is */ - FreeXid(xid); - return -EOPNOTSUPP; + if (type & LOCKING_ANDX_SHARED_LOCK) { + flock->fl_type = F_WRLCK; + rc = 0; + return rc; } - if (posix_locking) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, + type | LOCKING_ANDX_SHARED_LOCK, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type | LOCKING_ANDX_SHARED_LOCK, + 0, 0); + flock->fl_type 
= F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + } else + flock->fl_type = F_WRLCK; + + rc = 0; + return rc; +} + +static int +cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int lock, int unlock, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + __u16 netfid = cfile->netfid; + + if (posix_lck) { int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - if (numUnlock == 1) + if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, - length, pfLock, posix_lock_type, - wait_flag); - } else { - struct cifsFileInfo *fid = file->private_data; - - if (numLock) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, numLock, lockType, - wait_flag, 0); - - if (rc == 0) { - /* For Windows locks we must store them. */ - rc = store_file_lock(fid, length, - pfLock->fl_start, lockType); - } - } else if (numUnlock) { - /* For each stored lock that this unlock overlaps - completely, unlock it. */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; + rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, length, + flock, posix_lock_type, wait_flag); + goto out; + } - rc = 0; - mutex_lock(&fid->lock_mutex); - list_for_each_entry_safe(li, tmp, &fid->llist, llist) { - if (pfLock->fl_start <= li->offset && - (pfLock->fl_start + length) >= - (li->offset + li->length)) { - stored_rc = CIFSSMBLock(xid, tcon, - netfid, li->length, - li->offset, 1, 0, - li->type, false, 0); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - kfree(li); - } - } + if (lock) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, lock, type, wait_flag, 0); + if (rc == 0) { + /* For Windows locks we must store them. */ + rc = store_file_lock(cfile, length, flock->fl_start, + type, netfid); + } + } else if (unlock) { + /* + * For each stored lock that this unlock overlaps completely, + * unlock it. 
+ */ + int stored_rc = 0; + struct cifsLockInfo *li, *tmp; + + mutex_lock(&cfile->lock_mutex); + list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + + stored_rc = CIFSSMBLock(xid, tcon, netfid, + current->tgid, li->length, + li->offset, 1, 0, li->type, + 0, 0); + if (stored_rc) + rc = stored_rc; + else { + list_del(&li->llist); + kfree(li); } - mutex_unlock(&fid->lock_mutex); } + mutex_unlock(&cfile->lock_mutex); + } +out: + if (flock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, flock); + return rc; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *flock) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + __u16 netfid; + __u8 type; + + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " + "end: %lld", cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); + + cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + netfid = cfile->netfid; + cinode = CIFS_I(file->f_path.dentry->d_inode); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + /* + * BB add code here to normalize offset and length to account for + * negative length which we can not accept over the wire. + */ + if (IS_GETLK(cmd)) { + rc = cifs_getlk(cfile, flock, type, wait_flag, posix_lck, xid); + FreeXid(xid); + return rc; + } + + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + FreeXid(xid); + return -EOPNOTSUPP; } - if (pfLock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, pfLock); + rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, + xid); FreeXid(xid); return rc; } @@ -2423,8 +2454,9 @@ void cifs_oplock_break(struct work_struct *work) * disconnected since oplock already released by the server */ if (!cfile->oplock_break_cancelled) { - rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, + current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } -- cgit v1.2.3-58-ga151 From fe11e4ccb8479d92cd2a101d380d332544b84aaa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 06:41:32 -0400 Subject: cifs: clean up check_rfc1002_header Rename it for better clarity as to what it does and have the caller pass in just the single type byte. Turn the if statement into a switch and optimize it by placing the most common message type at the top. Move the header length check back into cifs_demultiplex_thread in preparation for adding a new receive phase and normalize the cFYI messages. 
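For reference, the session packet types that the new is_smb_response() switches on come from the first byte of the RFC 1002 session header. A purely illustrative enum follows (values are those defined by RFC 1002; the kernel's own constant names live in cifspdu.h and may differ):

	/* First byte of an RFC 1002 session packet (illustrative only) */
	enum rfc1002_session_type {
		RFC1002_SESSION_MESSAGE_T       = 0x00,	/* normal SMB data: the hot path */
		RFC1002_SESSION_REQUEST_T       = 0x81,
		RFC1002_POSITIVE_SESSION_RESP_T = 0x82,
		RFC1002_NEGATIVE_SESSION_RESP_T = 0x83,
		RFC1002_RETARGET_SESSION_RESP_T = 0x84,
		RFC1002_SESSION_KEEP_ALIVE_T    = 0x85,
	};

Putting the 0x00 case first means the common path falls out of the switch after one comparison, which is the optimization the patch mentions.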
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 63 ++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 82bc0d27e495..97a65af2a08a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -427,29 +427,29 @@ read_from_socket(struct TCP_Server_Info *server, char *buf, } static bool -check_rfc1002_header(struct TCP_Server_Info *server, char *buf) +is_smb_response(struct TCP_Server_Info *server, unsigned char type) { - char temp = *buf; - unsigned int pdu_length = be32_to_cpu( - ((struct smb_hdr *)buf)->smb_buf_length); - /* * The first byte big endian of the length field, * is actually not part of the length but the type * with the most common, zero, as regular data. */ - if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { - return false; - } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { - cFYI(1, "Good RFC 1002 session rsp"); - return false; - } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB response */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + cFYI(1, "RFC 1002 session keep alive"); + break; + case RFC1002_POSITIVE_SESSION_RESPONSE: + cFYI(1, "RFC 1002 positive session response"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on * SMB negprot response. */ - cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", - pdu_length); + cFYI(1, "RFC 1002 negative session response"); /* give server a second to clean up */ msleep(1000); /* @@ -458,29 +458,16 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf) * is since we do not begin with RFC1001 session * initialize frame). 
*/ - cifs_set_port((struct sockaddr *) - &server->dstaddr, CIFS_PORT); + cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); cifs_reconnect(server); wake_up(&server->response_q); - return false; - } else if (temp != (char) 0) { - cERROR(1, "Unknown RFC 1002 frame"); - cifs_dump_mem(" Received Data: ", buf, 4); - cifs_reconnect(server); - return false; - } - - /* else we have an SMB response */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - 4, pdu_length+4); + break; + default: + cERROR(1, "RFC 1002 unknown response type 0x%x", type); cifs_reconnect(server); - wake_up(&server->response_q); - return false; } - return true; + return false; } static struct mid_q_entry * @@ -683,10 +670,20 @@ cifs_demultiplex_thread(void *p) */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); - cFYI(1, "rfc1002 length 0x%x", pdu_length+4); - if (!check_rfc1002_header(server, buf)) + cFYI(1, "RFC1002 header 0x%x", pdu_length); + if (!is_smb_response(server, buf[0])) continue; + /* check the length */ + if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || + (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { + cERROR(1, "Invalid size SMB length %d pdu_length %d", + 4, pdu_length + 4); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; + } + /* else length ok */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { isLargeBuf = true; -- cgit v1.2.3-58-ga151 From d59dad2be038132259ac99a2837d65a87fd90588 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 22 Sep 2011 09:53:59 +0400 Subject: CIFS: Move byte range lock list from fd to inode This lets us do local lock checks before sending requests to the server. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 ++- fs/cifs/cifsglob.h | 7 ++++--- fs/cifs/file.c | 30 +++++++++++++++++------------- 3 files changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5c2972106816..b0a2e1647390 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -945,7 +945,8 @@ cifs_init_once(void *inode) struct cifsInodeInfo *cifsi = inode; inode_init_once(&cifsi->vfs_inode); - INIT_LIST_HEAD(&cifsi->lockList); + INIT_LIST_HEAD(&cifsi->llist); + mutex_init(&cifsi->lock_mutex); } static int diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1f265ffe7e63..55ebf39fb3fd 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -494,6 +494,7 @@ struct cifsLockInfo { __u64 length; __u32 pid; __u8 type; + __u16 netfid; }; /* @@ -526,8 +527,6 @@ struct cifsFileInfo { struct dentry *dentry; unsigned int f_flags; struct tcon_link *tlink; - struct mutex lock_mutex; - struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; int count; /* refcount protected by cifs_file_list_lock */ @@ -560,7 +559,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { - struct list_head lockList; + struct list_head llist; /* brlocks for this inode */ + bool can_cache_brlcks; + struct mutex lock_mutex; /* protect two fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g.
DOS archive bit, sparse, compressed, system */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ab85699c5653..7f84ece116d0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -262,8 +262,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, pCifsFile->invalidHandle = false; pCifsFile->tlink = cifs_get_tlink(tlink); mutex_init(&pCifsFile->fh_mutex); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -331,12 +329,14 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - mutex_lock(&cifs_file->lock_mutex); - list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + mutex_lock(&cifsi->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) { + if (li->netfid != cifs_file->netfid) + continue; list_del(&li->llist); kfree(li); } - mutex_unlock(&cifs_file->lock_mutex); + mutex_unlock(&cifsi->lock_mutex); cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); @@ -639,20 +639,21 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *cfile, __u64 len, +static int store_file_lock(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, __u8 type, __u16 netfid) { struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); if (li == NULL) return -ENOMEM; + li->netfid = netfid; li->offset = offset; li->length = len; li->type = type; li->pid = current->tgid; - mutex_lock(&cfile->lock_mutex); - list_add_tail(&li->llist, &cfile->llist); - mutex_unlock(&cfile->lock_mutex); + mutex_lock(&cinode->lock_mutex); + list_add_tail(&li->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); return 0; } @@ -769,6 +770,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, __u64 length = 1 + flock->fl_end - flock->fl_start; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); __u16 netfid = cfile->netfid; if (posix_lck) { @@ -791,7 +793,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, flock->fl_start, 0, lock, type, wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. 
*/ - rc = store_file_lock(cfile, length, flock->fl_start, + rc = store_file_lock(cinode, length, flock->fl_start, type, netfid); } } else if (unlock) { @@ -802,14 +804,16 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, int stored_rc = 0; struct cifsLockInfo *li, *tmp; - mutex_lock(&cfile->lock_mutex); - list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { + mutex_lock(&cinode->lock_mutex); + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { if (flock->fl_start > li->offset || (flock->fl_start + length) < (li->offset + li->length)) continue; if (current->tgid != li->pid) continue; + if (cfile->netfid != li->netfid) + continue; stored_rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, li->length, @@ -822,7 +826,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, kfree(li); } } - mutex_unlock(&cfile->lock_mutex); + mutex_unlock(&cinode->lock_mutex); } out: if (flock->fl_flags & FL_POSIX) -- cgit v1.2.3-58-ga151 From b916c5cd4d895a27b47a652648958f73e4f23ac6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:55:51 +0300 Subject: ore: Only IO one group at a time (API change) Usually a single IO is confined to one group of devices (group_width) and at the boundary of a raid group it can spill into a second group. Current code would allocate a full device_table size array at each io_state so it could satisfy requests that span two groups. Needless to say that is very wasteful, especially when the device_table count can get very large (hundreds, even thousands), while a group_width is usually 8 or 10. * Change the ore API to trim an IO that spans two raid groups. The user passes offset+length to ore_get_rw_state; the ore might trim that length if it spans a group boundary. The user must check ios->length or ios->nrpages to see how much IO will actually be performed. It is the responsibility of the user to re-issue the remainder of the IO. * Modify exofs to copy spilled pages onto the next IO. This means one last kick is needed after all coalescing of pages is done.
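To make the new contract concrete, here is a schematic sketch of the caller loop it implies (not exofs code; error cleanup such as ore_put_io_state() is elided, and a synchronous read is assumed):

	/* Issue [offset, offset+length) in per-group pieces, re-issuing
	 * whatever the ore trims at each raid-group boundary. */
	while (length) {
		struct ore_io_state *ios;
		int ret;

		ret = ore_get_rw_state(layout, oc, true /* reading */,
				       offset, length, &ios);
		if (unlikely(ret))
			return ret;

		ret = ore_read(ios);	/* performs at most ios->length bytes */
		if (unlikely(ret))
			return ret;

		offset += ios->length;	/* advance past what was actually done */
		length -= ios->length;	/* the remainder becomes the next IO */
	}

exofs implements exactly this by carrying the spilled pages into the next page_collect, as the inode.c changes below show.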
Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 100 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/exofs/ore.c | 105 ++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 154 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 61b2f7e5cdbd..d87c1f7562fb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; if (pcol->read_4_write) { ore_read(pcol->ios); @@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol) ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); - if (unlikely(ret)) goto err; @@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; ios->done = writepages_done; ios->private = pcol_copy; + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx 
length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_write(ios); if (unlikely(ret)) { EXOFS_ERR("write_exec: ore_write() Failed\n"); @@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } static int exofs_writepage(struct page *page, struct writeback_control *wbc) diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index a7d79257fc65..c1c2cc607adf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); +static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { return ios->oc->comps[index & ios->oc->single_comp].cred; @@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, - bool is_reading, u64 offset, u64 length, - struct ore_io_state **pios) +static int _get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + struct ore_io_state **pios) { struct ore_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); if (unlikely(!ios)) { ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(oc->numdevs)); + ore_io_state_size(numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; ios->oc = oc; - ios->offset = offset; - ios->length = length; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) 
+ * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + int ret; + + ret = _get_io_state(layout, oc, numdevs, pios); + if (unlikely(ret)) + return ret; + + ios = *pios; ios->reading = is_reading; + ios->offset = offset; + + if (length) { + struct ore_striping_info si; + + ore_calc_stripe_info(layout, offset, &si); + ios->length = (length <= si.group_length) ? length : + si.group_length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + } - *pios = ios; return 0; } EXPORT_SYMBOL(ore_get_rw_state); +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, - struct ore_io_state **ios) + struct ore_io_state **pios) { - return ore_get_rw_state(layout, oc, true, 0, 0, ios); + return _get_io_state(layout, oc, oc->numdevs, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -374,12 +424,12 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { - struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -397,9 +447,6 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } - - if (max_comp < dev) - max_comp = dev; } else { cur_len = stripe_unit; } @@ -417,17 +464,15 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, length -= cur_len; } out: - ios->numdevs = max_comp + mirrors_p1; + ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; return ret; } static int _prepare_for_striping(struct ore_io_state *ios) { - u64 length = ios->length; - u64 offset = ios->offset; struct ore_striping_info si; - int ret = 0; + int ret; if (!ios->pages) { if (ios->kern_buff) { @@ -446,21 +491,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) return 0; } - while (length) { - ore_calc_stripe_info(ios->layout, offset, &si); - - if (length < si.group_length) - si.group_length = length; + ore_calc_stripe_info(ios->layout, ios->offset, &si); - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; + BUG_ON(ios->length > si.group_length); + ret = _prepare_one_group(ios, ios->length, &si); - offset += si.group_length; - length -= si.group_length; - } - -out: return ret; } @@ -742,7 +777,6 @@ struct _trunc_info { unsigned first_group_dev; unsigned nex_group_dev; - unsigned max_devs; }; static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, @@ -757,7 +791,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, 
ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); ti->nex_group_dev = ti->first_group_dev + layout->group_width; - ti->max_devs = layout->group_width * layout->group_count; } int ore_truncate(struct ore_layout *layout, struct ore_components *oc, @@ -777,7 +810,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, _calc_trunk_info(ios->layout, size, &ti); - size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; @@ -786,7 +819,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, ios->numdevs = ios->oc->numdevs; - for (i = 0; i < ti.max_devs; ++i) { + for (i = 0; i < ios->numdevs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; -- cgit v1.2.3-58-ga151 From 98260754046eee4cc7d75751a4a20182ade39f58 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 2 Oct 2011 15:32:50 +0200 Subject: ore: cleanup: Embed an ore_striping_info inside ore_io_state Now that each ore_io_state covers only a single raid group, only a single striping_info calculation is needed. Embed one inside ore_io_state to cache the calculation results and eliminate an extra call. Also the outer _prepare_for_striping is removed since it does nothing. Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 61 ++++++++++++++++++++------------------------ include/scsi/osd_ore.h | 1 + 2 files changed, 25 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index c1c2cc607adf..7e02d33b5e5c 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -121,11 +121,9 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, ios->offset = offset; if (length) { - struct ore_striping_info si; - - ore_calc_stripe_info(layout, offset, &si); - ios->length = (length <= si.group_length) ? length : - si.group_length; + ore_calc_stripe_info(layout, offset, &ios->si); + ios->length = (length <= ios->si.group_length) ?
length : + ios->si.group_length; ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; } @@ -416,17 +414,36 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct ore_striping_info *si) +static int _prepare_for_striping(struct ore_io_state *ios) { + struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; int ret = 0; + if (!ios->pages) { + if (ios->kern_buff) { + struct ore_per_dev_state *per_dev = &ios->per_dev[0]; + + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si->unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->group_length); + while (length) { unsigned comp = dev - first_dev; struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; @@ -469,36 +486,6 @@ out: return ret; } -static int _prepare_for_striping(struct ore_io_state *ios) -{ - struct ore_striping_info si; - int ret; - - if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - - ore_calc_stripe_info(ios->layout, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - ore_calc_stripe_info(ios->layout, ios->offset, &si); - - BUG_ON(ios->length > si.group_length); - ret = _prepare_one_group(ios, ios->length, &si); - - return ret; -} - int ore_create(struct ore_io_state *ios) { int i, ret; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 8fefdfbb1ced..baeef0200a1f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -93,6 +93,7 @@ typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); struct ore_io_state { struct kref kref; + struct ore_striping_info si; void *private; ore_io_done_fn done; -- cgit v1.2.3-58-ga151 From 6851a5e5c12b35f8bb8986d0ff16ca9be4cf2c09 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 24 Aug 2011 18:10:49 -0700 Subject: ore: Remove check for ios->kern_buff in _prepare_for_striping to later Move the check and preparation of the ios->kern_buff case to later inside _write_mirror(). Since read was never used with ios->kern_buff its support is removed instead of fixed. 
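For illustration, a minimal usage sketch of a kern_buff write under the ore API shown in these patches (the helper name is hypothetical and error handling is abbreviated); after this change kern_buff is write-only and the buffer must not cross a stripe unit:

static int example_write_kern(struct ore_layout *layout,
			      struct ore_components *oc,
			      u64 offset, void *buf, u64 len)
{
	struct ore_io_state *ios;
	int ret;

	/* is_reading=false: kern_buff reads are no longer supported */
	ret = ore_get_rw_state(layout, oc, false, offset, len, &ios);
	if (unlikely(ret))
		return ret;

	ios->kern_buff = buf;	/* must fit inside a single stripe unit */
	ret = ore_write(ios);	/* _write_mirror() fills per_dev->offset/dev */
	ore_put_io_state(ios);
	return ret;
}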
Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 7e02d33b5e5c..d0f8db5e46c5 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -427,17 +427,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) int ret = 0; if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - - per_dev->offset = si->obj_offset; - per_dev->dev = si->dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si->unit_off + ios->length > - ios->layout->stripe_unit)); - } ios->numdevs = ios->layout->mirrors_p1; return 0; } @@ -557,7 +546,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } per_dev->or = or; - per_dev->offset = master_dev->offset; if (ios->pages) { struct bio *bio; @@ -576,6 +564,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; + per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; per_dev->bio = bio; per_dev->dev = dev; @@ -593,7 +582,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) @@ -602,7 +599,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) "length=0x%llx dev=%d\n", _LLU(_ios_obj(ios, dev)->id), _LLU(per_dev->offset), - _LLU(ios->length), dev); + _LLU(ios->length), per_dev->dev); } else { osd_req_set_attributes(or, _ios_obj(ios, dev)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", @@ -668,16 +665,9 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) " dev=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, obj, per_dev->offset, - ios->kern_buff, ios->length); - ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(obj->id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; } else { + BUG_ON(ios->kern_buff); + osd_req_get_attributes(or, obj); ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(obj->id), -- cgit v1.2.3-58-ga151 From 154a9300cd87eee7538f356c9d339f06b0b1d257 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 26 Aug 2011 21:00:32 -0700 Subject: exofs: Support for short read/writes If at read/write_done the actual IO was shorter than requested, the shortfall is reported in the returned ios->length. It is not an error. The remainder of the pages should just be unlocked, but not marked uptodate or end_page_writeback. They will be re-issued later by the VFS.
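The per-page rule this implies can be sketched as below; this is a hedged illustration, not the patch's code (exofs actually routes it through update_read_page()/update_write_page() and the PAGE_WAS_NOT_IN_IO marker in the diff that follows):

static void example_read_pages_done(struct page **pages, unsigned nr_pages,
				    u64 good_bytes)
{
	unsigned i;

	/* good_bytes is the IO actually executed (clamped to ios->length) */
	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];

		if (((u64)i << PAGE_CACHE_SHIFT) < good_bytes)
			SetPageUptodate(page);	/* the IO really covered it */
		/* else: neither Uptodate nor Error, so the VFS will
		 * simply re-issue the page later */
		unlock_page(page);
	}
}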
Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d87c1f7562fb..96366a1d7eae 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -149,14 +149,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -167,16 +170,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -195,10 +204,14 @@ static int __readpages_done(struct page_collect *pcol) u64 length = 0; int ret = ore_check_io(pcol->ios, &resid); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else + ret = PAGE_WAS_NOT_IN_IO; + } else { good_bytes = pcol->length - resid; + } + if (good_bytes > pcol->ios->length) + good_bytes = pcol->ios->length; EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -518,10 +531,14 @@ static void writepages_done(struct ore_io_state *ios, void *p) atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else + ret = PAGE_WAS_NOT_IN_IO; + } else { good_bytes = pcol->length - resid; + } + if (good_bytes > pcol->ios->length) + good_bytes = pcol->ios->length; EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", -- cgit v1.2.3-58-ga151 From bbf9a31bba8c985780fe94da059cc5813a7920f5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 26 Aug 2011 21:04:52 -0700 Subject: ore: Support for short read/writes Memory conditions and max_bio constraints might cause us to not comply with the full length of the requested IO. Instead of failing the complete IO we can issue a shorter read/write and report how much was actually executed in the ios->length member. All users must check ios->length at IO_done or upon return of ore_read/write and re-issue the remainder of the bytes, because otherwise no error is returned as before. This is part of the effort to support the pnfs-obj layout driver.
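A caller honoring this contract would loop roughly as in the following sketch (synchronous read, page attachment elided, helper name hypothetical):

static int example_read_all(struct ore_layout *layout,
			    struct ore_components *oc,
			    u64 offset, u64 length)
{
	while (length) {
		struct ore_io_state *ios;
		int ret;

		ret = ore_get_rw_state(layout, oc, true, offset, length, &ios);
		if (unlikely(ret))
			return ret;

		/* ... attach pages covering [offset, offset + ios->length) ... */

		ret = ore_read(ios);	/* synchronous when ios->done is unset */
		if (unlikely(ret)) {
			ore_put_io_state(ios);
			return ret;
		}
		offset += ios->length;	/* may be less than we asked for */
		length -= ios->length;
		ore_put_io_state(ios);
	}
	return 0;
}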
Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d0f8db5e46c5..8354fe061d1c 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -377,8 +377,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; + unsigned len = cur_len; + int ret; if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * @@ -390,7 +390,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, if (unlikely(!per_dev->bio)) { ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } @@ -403,15 +404,24 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; + if (unlikely(pglen != added_len)) { + ret = -ENOMEM; + goto out; + } pgbase = 0; ++pg; } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; - return 0; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the opposite is fatal. + */ + return ret; } static int _prepare_for_striping(struct ore_io_state *ios) @@ -472,7 +482,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) out: ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; - return ret; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; } int ore_create(struct ore_io_state *ios) -- cgit v1.2.3-58-ga151 From 3bd9856857339d7ee8c4ad50030583f1b9415c39 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 12:04:23 +0300 Subject: ore: Support for partial component table Users like the objlayout-driver would like to only pass a partial device table that covers the IO in question. For example exofs divides the file into raid-group-sized chunks and only serves group_width number of devices at a time. The partiality is communicated by setting ore_components->first_dev, and the array covers all logical devices from oc->first_dev up to (oc->first_dev + oc->numdevs). The ore_comp_dev() API receives a logical device index and returns the actual present device in the table. An out-of-range dev_index will BUG. Logical device index is the theoretical device index as if all the devices of a file are present.
i.e.: total_devs = group_width * mirrors_p1 * group_count, and 0 <= dev_index < total_devs. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 1 + fs/exofs/ore.c | 4 ++++ include/scsi/osd_ore.h | 7 ++++--- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 006fd6f33571..51f4b4c40f09 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -217,6 +217,7 @@ static inline void exofs_init_comps(struct ore_components *oc, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); + oc->first_dev = 0; oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * sbi->layout.group_count; oc->single_comp = EC_SINGLE_COMP; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8354fe061d1c..f1b718028a1f 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -62,6 +62,10 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + return ore_comp_dev(ios->oc, index); } diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index baeef0200a1f..492b70d43bb6 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -49,6 +49,7 @@ struct ore_dev { }; struct ore_components { + unsigned first_dev; /* First logical device no */ unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single * component. else there are @numdevs components @@ -70,14 +71,14 @@ struct ore_components { static inline struct osd_dev *ore_comp_dev( const struct ore_components *oc, unsigned i) { - BUG_ON(oc->numdevs <= i); - return oc->ods[i]->od; + BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i)); + return oc->ods[i - oc->first_dev]->od; } static inline void ore_comp_set_dev( struct ore_components *oc, unsigned i, struct osd_dev *od) { - oc->ods[i]->od = od; + oc->ods[i - oc->first_dev]->od = od; } struct ore_striping_info { -- cgit v1.2.3-58-ga151 From 5a51c0c7e9a913649aa65d8233470682bcbb7694 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:18:45 +0300 Subject: ore/exofs: Define new ore_verify_layout All users of the ore will need to check if current code supports the given layout. For example RAID5/6 is not currently supported. So move all the checks from exofs/super.c to a new ore_verify_layout() to be used by ore users. Note that any new layout should be passed through the ore_verify_layout() because the ore engine will prepare and verify some internal members of ore_layout, and assumes it has been called. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 9 ++----- fs/exofs/ore.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/exofs/super.c | 49 +++--------------------------------- include/scsi/osd_ore.h | 8 ++++++ 4 files changed, 80 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 96366a1d7eae..5a62420cbdb1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,11 +37,7 @@ #define EXOFS_DBGMSG2(M...)
do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) @@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f1b718028a1f..4ca59d492798 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -47,9 +47,76 @@ MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, struct ore_striping_info *si); +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; + +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + +/* FIXME: Only raid0 is supported for now. */ + if (layout->raid_algorithm != PNFS_OSD_RAID_0) { + ORE_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } + + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } + + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); + static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { return ios->oc->comps[index & ios->oc->single_comp].cred; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index bce3686f0aa0..057b237b8b69 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -480,7 +480,7 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct 
exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); @@ -493,50 +493,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) { - EXOFS_ERR("Data Map wrong, " - "numdevs=%d < group_width=%d * mirrors=%d\n", - numdevs, sbi->layout.group_width, - sbi->layout.mirrors_p1); - return -EINVAL; - } - - if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->layout.stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - if (sbi->layout.group_width) { - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 / - sbi->layout.group_width; - } else { - if (sbi->layout.group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %lld\n", - _LLU(sbi->layout.group_depth)); - } - sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); EXOFS_DBGMSG("exofs: layout: " "num_comps=%u stripe_unit=0x%x group_width=%u " @@ -547,7 +504,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, sbi->layout.raid_algorithm); - return 0; + return ret; } static unsigned __ra_pages(struct ore_layout *layout) diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 492b70d43bb6..716dbeae8cd2 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -42,6 +42,13 @@ struct ore_layout { unsigned group_width; u64 group_depth; unsigned group_count; + + /* Cached often needed calculations filled in by + * ore_verify_layout + */ + unsigned long max_io_length; /* Max length that should be passed to + * ore_get_rw_state + */ }; struct ore_dev { @@ -138,6 +145,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) } /* ore.c */ +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3-58-ga151 From 4b46c9f5cf69505f0bc708995b88b0cc60317ffd Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:25:50 +0300 Subject: ore/exofs: Change ore_check_io API The current ore_check_io API receives a residual pointer to report partial IO. But it is actually not used, because in a multiple-device IO there is no linearity in the IO failure. On the other hand, if every failing device is reported through a supplied callback, measures can be taken to handle only the failed devices, one at a time. This will also be needed by the objects-layout-driver for its error reporting facility.
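The shape of such a callback, matching the ore_on_dev_error typedef added below, might look like this sketch (the body is illustrative only; a pnfs-objects user would record the failing range for its error report):

static void example_on_dev_error(struct ore_io_state *ios,
				 struct ore_dev *od, unsigned dev_index,
				 enum osd_err_priority oep,
				 u64 dev_offset, u64 dev_len)
{
	pr_err("ore: IO error dev_index=%u offset=0x%llx len=0x%llx pri=%d\n",
	       dev_index, (unsigned long long)dev_offset,
	       (unsigned long long)dev_len, oep);
	/* e.g. mark od as suspect, or queue (dev_offset, dev_len) for a
	 * layout error report -- one failed device at a time */
}

/* a user would then call: ret = ore_check_io(ios, example_on_dev_error); */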
Exofs is not currently using the new information and keeps the old behaviour of failing the complete IO in case of an error. (No partial completion) TODO: Use an ore_check_io callback to set_page_error only the failing pages. And re-dirty write pages. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 14 ++++---------- fs/exofs/ore.c | 29 ++++++++++++++++------------- include/scsi/osd_ore.h | 5 ++++- 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5a62420cbdb1..86c0ac87b8e3 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -194,19 +194,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); if (likely(!ret)) { good_bytes = pcol->length; ret = PAGE_WAS_NOT_IN_IO; } else { - good_bytes = pcol->length - resid; + good_bytes = 0; } - if (good_bytes > pcol->ios->length) - good_bytes = pcol->ios->length; EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -519,10 +516,9 @@ static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); @@ -530,10 +526,8 @@ static void writepages_done(struct ore_io_state *ios, void *p) good_bytes = pcol->length; ret = PAGE_WAS_NOT_IN_IO; } else { - good_bytes = pcol->length - resid; + good_bytes = 0; } - if (good_bytes > pcol->ios->length) - good_bytes = pcol->ios->length; EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 4ca59d492798..3b1cc3a132d7 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -317,7 +317,7 @@ static void _clear_bio(struct bio *bio) } } -int ore_check_io(struct ore_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -325,7 +325,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; int ret; if (unlikely(!or)) @@ -337,29 +338,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); + _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); + _LLU(per_dev->offset), + _LLU(per_dev->length)); continue; /* we recovered */ } + if (on_dev_error) { + u64 residual = ios->reading ? 
+ or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + struct ore_dev *od = ios->oc->ods[ + per_dev->dev - ios->oc->first_dev]; + + on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + offset, residual); + } if (osi.osd_err_pri >= acumulated_osd_err) { acumulated_osd_err = osi.osd_err_pri; acumulated_lin_err = ret; } } - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - return acumulated_lin_err; } EXPORT_SYMBOL(ore_check_io); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 716dbeae8cd2..af2231a0fd09 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -153,7 +153,10 @@ int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, struct ore_io_state **ios); void ore_put_io_state(struct ore_io_state *ios); -int ore_check_io(struct ore_io_state *ios, u64 *resid); +typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od, + unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len); +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep); int ore_create(struct ore_io_state *ios); int ore_remove(struct ore_io_state *ios); -- cgit v1.2.3-58-ga151 From 01cd4afadbf376de07d364a632cc82a0fc5e8655 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Oct 2011 10:41:17 +0300 Subject: nfsd4: typo logical vs bitwise negate This should be a bitwise negate here. It silences a Sparse warning: fs/nfsd/nfs4xdr.c:693:16: warning: dubious: x & !y Signed-off-by: Dan Carpenter Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2cab33cc3238..645a0a9d8073 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -690,7 +690,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) READ_BUF(4); READ32(*x); /* Note: unlinke access bits, deny bits may be zero. */ - if (*x & !NFS4_SHARE_DENY_BOTH) + if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; return nfs_ok; xdr_error: -- cgit v1.2.3-58-ga151 From a5ff376966c079bd2f078524eff11b0c63cc2507 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 13 Oct 2011 10:26:03 -0500 Subject: cifs: Call id to SID mapping functions to change owner/group (try #4 repost) Now build a security descriptor to change either owner or group at the server. Initially the security descriptor was built to change only the (D)ACL; that functionality has been extended. When either an Owner or a Group of a file object at the server is changed, the rest of the security descriptor remains the same (DACL etc.). To set a security descriptor, it is necessary to open that file with permission bits of either WRITE_DAC if the DACL is being modified, or WRITE_OWNER (Take Ownership) if the Owner or Group is being changed. It is the server that decides whether a set security descriptor with either owner or group change succeeds or not.
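Distilled, the access-right rule the patch applies when opening the file (a sketch mirroring the set_cifs_acl() change in the diff below):

static __u32 access_flags_for_acl_change(int aclflag)
{
	/* chown/chgrp rewrite the owner/group SIDs: need "Take Ownership" */
	if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
		return WRITE_OWNER;
	/* chmod rewrites only the DACL */
	return WRITE_DAC;
}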
Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 135 ++++++++++++++++++++++++++++++++-------------------- fs/cifs/cifsproto.h | 7 +-- fs/cifs/cifssmb.c | 4 +- fs/cifs/inode.c | 35 +++++++++----- fs/cifs/xattr.c | 2 +- 5 files changed, 113 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 992f1fb299d1..72ddf23ef6f7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -904,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, acl_size = sizeof(struct cifs_acl); num_aces = le32_to_cpu(pdacl->num_aces); - if (num_aces > 0) { + if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; @@ -1066,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, else cFYI(1, "no ACL"); /* BB grant all or default perms? */ -/* cifscred->uid = owner_sid_ptr->rid; - cifscred->gid = group_sid_ptr->rid; - memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, - sizeof(struct cifs_sid)); - memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, - sizeof(struct cifs_sid)); */ - return rc; } - /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - struct inode *inode, __u64 nmode) + __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ - if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return -EIO; - - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (nmode != NO_CHANGE_64) { /* chmod */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); - - dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; - - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); - - sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - - /* copy security descriptor control portion and owner and group sid */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + nmode); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy sec desc control portion & owner and group sids */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + *aclflag = CIFS_ACL_DACL; + } else { + memcpy(pnntsd, pntsd, secdesclen); + if (uid != NO_CHANGE_32) { /* chown */ + owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->osidoffset)); + nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!nowner_sid_ptr) + 
return -ENOMEM; + rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for owner id %d", + __func__, rc, uid); + kfree(nowner_sid_ptr); + return rc; + } + memcpy(owner_sid_ptr, nowner_sid_ptr, + sizeof(struct cifs_sid)); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } + if (gid != NO_CHANGE_32) { /* chgrp */ + group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->gsidoffset)); + ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!ngroup_sid_ptr) + return -ENOMEM; + rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for group id %d", + __func__, rc, gid); + kfree(ngroup_sid_ptr); + return rc; + } + memcpy(group_sid_ptr, ngroup_sid_ptr, + sizeof(struct cifs_sid)); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } + } return rc; } @@ -1192,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, - struct cifs_ntsd *pnntsd, u32 acllen) + /* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path, int aclflag) { int oplock = 0; - int xid, rc, create_options = 0; + int xid, rc, access_flags, create_options = 0; __u16 fid; struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1210,15 +1242,20 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, create_options, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) + access_flags = WRITE_OWNER; + else + access_flags = WRITE_DAC; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { cERROR(1, "Unable to open file to set ACL"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); cFYI(DBG2, "SetCIFSACL rc = %d", rc); CIFSSMBClose(xid, tcon, fid); @@ -1228,17 +1265,6 @@ out: return rc; } -/* Set an ACL on the server */ -int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - - cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); - - return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); -} - /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -1270,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +int +id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, + uid_t uid, gid_t gid) { int rc = 0; + int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ @@ -1302,13 +1331,15 @@ int 
mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) return -ENOMEM; } - rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); cFYI(DBG2, "build_sec_desc rc: %d", rc); if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + rc = set_cifs_acl(pnntsd, secdesclen, inode, + path, aclflag); cFYI(DBG2, "set_cifs_acl rc: %d", rc); } diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 94834dbb46da..a1fa9cec05d6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -146,11 +146,12 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, + uid_t, gid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, - const char *); + const char *, int); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); @@ -420,7 +421,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, - struct cifs_ntsd *, __u32); + struct cifs_ntsd *, __u32, int); extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e33093f7ef0d..c824c106b2b7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3463,7 +3463,7 @@ qsec_out: int CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd *pntsd, __u32 acllen) + struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) { __u16 byte_count, param_count, data_count, param_offset, data_offset; int rc = 0; @@ -3500,7 +3500,7 @@ setCifsAclRetry: pSMB->Fid = fid; /* file handle always le */ pSMB->Reserved2 = 0; - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + pSMB->AclFlags = cpu_to_le32(aclflag); if (pntsd && acllen) { memcpy((char *) &pSMBr->hdr.Protocol + data_offset, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a7b2dcd4a53e..663c4e313be4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2096,6 +2096,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { int xid; + uid_t uid = NO_CHANGE_32; + gid_t gid = NO_CHANGE_32; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,13 +2148,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } - /* - * Without unix extensions we can't send ownership changes to the - * server, so silently ignore them. This is consistent with how - * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With - * CIFSACL support + proper Windows to Unix idmapping, we may be - * able to support this in the future. 
- */ + if (attrs->ia_valid & ATTR_UID) + uid = attrs->ia_uid; + + if (attrs->ia_valid & ATTR_GID) + gid = attrs->ia_gid; + +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, + uid, gid); + if (rc) { + cFYI(1, "%s: Setting id failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } + } else +#endif /* CONFIG_CIFS_ACL */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); @@ -2161,15 +2175,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, "Mode changed to 0%o", attrs->ia_mode); mode = attrs->ia_mode; - } - - if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = mode_to_cifs_acl(inode, full_path, mode); + rc = id_mode_to_cifs_acl(inode, full_path, mode, + NO_CHANGE_32, NO_CHANGE_32); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 2a22fb2989e4..fde15b1a1eb5 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -178,7 +178,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #ifdef CONFIG_CIFS_ACL memcpy(pacl, ea_value, value_size); rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path); + direntry->d_inode, full_path, CIFS_ACL_DACL); if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); -- cgit v1.2.3-58-ga151 From f472e02669073e4f1a388142bafa0f806fae841c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 17 Oct 2011 10:13:46 -0400 Subject: ext4: avoid stamping on other memories in ext4_ext_insert_index() Add a sanity check to make sure ix hasn't gone beyond the valid bounds of the extent block. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f473ddf0bd94..5c4861210d4c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -779,6 +779,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ix = curp->p_idx; } + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); -- cgit v1.2.3-58-ga151 From a50d2ad1721c0c785e9a74c0003ca044de6868a5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Oct 2011 16:24:27 -0400 Subject: nfsd4: centralize renew_client() calls There doesn't seem to be any harm to renewing the client a bit earlier, when it is looked up. That saves us from having to sprinkle renew_client calls over quite so many places. Also remove a redundant comment and do a little cleanup. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2042805da960..d90461eb9368 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -931,9 +931,6 @@ renew_client_locked(struct nfs4_client *clp) return; } - /* - * Move client to the end to the LRU list. 
- */ dprintk("renewing client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -1220,8 +1217,10 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) { + renew_client(clp); return clp; + } } return NULL; } @@ -2372,11 +2371,15 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { - struct nfs4_stateowner *so = NULL; + struct nfs4_stateowner *so; + struct nfs4_openowner *oo; list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return container_of(so, struct nfs4_openowner, oo_owner); + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { + oo = openowner(so); + renew_client(oo->oo_owner.so_client); + return oo; + } } return NULL; } @@ -2536,7 +2539,6 @@ renew: open->op_openowner = oo; } list_del_init(&oo->oo_close_lru); - renew_client(oo->oo_owner.so_client); return nfs_ok; } @@ -2970,7 +2972,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("nfsd4_renew: clientid not found!\n"); goto out; } - renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) @@ -3275,7 +3276,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_stid.sc_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); @@ -3293,7 +3293,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfs4_check_openmode(stp, flags); if (status) goto out; - renew_client(stp->st_stateowner->so_client); if (filpp) { if (flags & RD_STATE) *filpp = find_readable_file(stp->st_file); @@ -3412,7 +3411,6 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return status; *stpp = openlockstateid(s); cstate->replay_owner = (*stpp)->st_stateowner; - renew_client((*stpp)->st_stateowner->so_client); return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } @@ -3643,7 +3641,6 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_stid.sc_client); unhash_delegation(dp); out: -- cgit v1.2.3-58-ga151 From 3557e43b8f78e5c2347bab31626fdb4d09220ae7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Oct 2011 16:58:21 -0400 Subject: nfsd4: make is_open_owner boolean Signed-off-by: J. 
Bruce Fields --- fs/nfsd/state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index aa14f06af2df..87eecfd9b968 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -335,13 +335,13 @@ struct nfs4_replay { struct nfs4_stateowner { struct list_head so_strhash; /* hash by op_name */ struct list_head so_stateids; - int so_is_open_owner; /* 1=openowner,0=lockowner */ struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ struct nfs4_replay so_replay; + bool so_is_open_owner; }; struct nfs4_openowner { -- cgit v1.2.3-58-ga151 From bcf130f9dfbaccf91376a44b18f51ed8007840d6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Oct 2011 20:44:20 -0400 Subject: nfsd4: simplify process_open1 logic No change in behavior. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d90461eb9368..62aa91ae278b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2506,7 +2506,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; - __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; @@ -2515,30 +2514,25 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, oo = find_openstateowner_str(strhashval, open); open->op_openowner = oo; if (!oo) { - /* Make sure the client's lease hasn't expired. */ clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; - goto renew; + goto new_owner; } if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ clp = oo->oo_owner.so_client; release_openowner(oo); open->op_openowner = NULL; - goto renew; - } - status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); - if (status) - return status; -renew: - if (open->op_openowner == NULL) { - oo = alloc_init_open_stateowner(strhashval, clp, open); - if (oo == NULL) - return nfserr_jukebox; - open->op_openowner = oo; + goto new_owner; } list_del_init(&oo->oo_close_lru); + return nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); +new_owner: + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; return nfs_ok; } -- cgit v1.2.3-58-ga151 From d29b20cd589128a599e5045d4effc2d7dbc388f5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2011 15:12:59 -0400 Subject: nfsd4: clean up open owners on OPEN failure If process_open1() creates a new open owner, but the open later fails, the current code will leave the open owner around. It won't be on the close_lru list, and the client isn't expected to send a CLOSE, so it will hang around as long as the client does. Similarly, if process_open1() removes an existing open owner from the close lru, anticipating that an open owner that previously had no associated stateid's now will, but the open subsequently fails, then we'll again be left with the same leak. Fix both problems. Reported-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + fs/nfsd/nfs4state.c | 20 ++++++++++++++++++-- fs/nfsd/state.h | 1 + fs/nfsd/xdr4.h | 1 + 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5b192a2512b6..10b50d78bdc3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -409,6 +409,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); out: + nfsd4_cleanup_open_state(open, status); if (open->op_openowner) cstate->replay_owner = &open->op_openowner->oo_owner; else diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 62aa91ae278b..2c9a1a20e014 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2320,7 +2320,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return NULL; oo->oo_owner.so_is_open_owner = 1; oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_flags = 0; + oo->oo_flags = NFS4_OO_NEW; oo->oo_time = 0; oo->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&oo->oo_close_lru); @@ -2526,7 +2526,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_openowner = NULL; goto new_owner; } - list_del_init(&oo->oo_close_lru); return nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); new_owner: oo = alloc_init_open_stateowner(strhashval, clp, open); @@ -2946,6 +2945,23 @@ out: return status; } +void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +{ + if (open->op_openowner) { + struct nfs4_openowner *oo = open->op_openowner; + + if (!list_empty(&oo->oo_owner.so_stateids)) + list_del_init(&oo->oo_close_lru); + if (oo->oo_flags & NFS4_OO_NEW) { + if (status) { + release_openowner(oo); + open->op_openowner = NULL; + } else + oo->oo_flags &= ~NFS4_OO_NEW; + } + } +} + __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 87eecfd9b968..eab9dae23c06 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -359,6 +359,7 @@ struct nfs4_openowner { time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 #define NFS4_OO_PURGE_CLOSE 2 +#define NFS4_OO_NEW 4 unsigned char oo_flags; }; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4c8a7ec3f25d..32e6fd8d9768 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -554,6 +554,7 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, -- cgit v1.2.3-58-ga151 From 32513b40efdc693b3675f1c691ab901518fbcb6a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2011 16:00:16 -0400 Subject: nfsd4: preallocate nfs4_file in process_open1() Creating a new file is an irrevocable step--once it's visible in the filesystem, other processes may have seen it and done something with it, and unlinking it wouldn't simply undo the effects of the create. Therefore, in the case where OPEN creates a new file, we shouldn't do the create until we know that the rest of the OPEN processing will succeed. 
For example, we should preallocate a struct file in case we need it, rather than waiting to allocate it till process_open2(), which is already too late. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 60 ++++++++++++++++++++++++++++++++--------------------- fs/nfsd/xdr4.h | 1 + 2 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2c9a1a20e014..ae5d25075f67 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -104,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes) static struct list_head del_recall_lru; +static void nfsd4_free_file(struct nfs4_file *f) +{ + kmem_cache_free(file_slab, f); +} + static inline void put_nfs4_file(struct nfs4_file *fi) { @@ -111,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi) list_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); - kmem_cache_free(file_slab, fi); + nfsd4_free_file(fi); } } @@ -2190,30 +2195,28 @@ out: return status; } +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + /* OPEN Share state helper functions */ -static inline struct nfs4_file * -alloc_init_file(struct inode *ino) +static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) { - struct nfs4_file *fp; unsigned int hashval = file_hashval(ino); - fp = kmem_cache_alloc(file_slab, GFP_KERNEL); - if (fp) { - atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); - INIT_LIST_HEAD(&fp->fi_stateids); - INIT_LIST_HEAD(&fp->fi_delegations); - fp->fi_inode = igrab(ino); - fp->fi_had_conflict = false; - fp->fi_lease = NULL; - memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); - memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); - return fp; - } - return NULL; + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); } static void @@ -2509,6 +2512,13 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); oo = find_openstateowner_str(strhashval, open); @@ -2884,9 +2894,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) goto out; status = nfserr_jukebox; - fp = alloc_init_file(ino); - if (fp == NULL) - goto out; + fp = open->op_file; + open->op_file = NULL; + nfsd4_init_file(fp, ino); } /* @@ -2960,6 +2970,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) oo->oo_flags &= ~NFS4_OO_NEW; } } + if (open->op_file) + nfsd4_free_file(open->op_file); } __be32 diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 32e6fd8d9768..502dd43634f9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -228,6 +228,7 @@ struct nfsd4_open { u32 op_rflags; /* response */ int op_truncate; /* used during processing */ struct
nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr -- cgit v1.2.3-58-ga151 From 996e09385c364f97a89648b401409521e2a3a094 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Oct 2011 11:14:48 -0400 Subject: nfsd4: do idr preallocation with stateid allocation Move idr preallocation out of stateid initialization, into stateid allocation, so that we no longer have to handle any errors from the former. This is a little subtle due to the way the idr code manages these preallocated items--document that in comments. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 76 ++++++++++++++++++++++++++--------------------------- fs/nfsd/state.h | 4 +-- 2 files changed, 39 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ae5d25075f67..1f8c781c2a28 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -215,13 +215,12 @@ static inline int get_new_stid(struct nfs4_stid *stid) int new_stid; int error; - if (!idr_pre_get(stateids, GFP_KERNEL)) - return -ENOMEM; - error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); /* - * All this code is currently serialized; the preallocation - * above should still be ours: + * Note: the necessary preallocation was done in + * nfs4_alloc_stateid(). The idr code caps the number of + * preallocations that can exist at a time, but the state lock + * prevents anyone from using ours before we get here: */ BUG_ON(error); /* @@ -240,7 +239,7 @@ static inline int get_new_stid(struct nfs4_stid *stid) return new_stid; } -static inline __be32 init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) { stateid_t *s = &stid->sc_stateid; int new_id; @@ -249,12 +248,24 @@ static inline __be32 init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, u stid->sc_client = cl; s->si_opaque.so_clid = cl->cl_clientid; new_id = get_new_stid(stid); - if (new_id < 0) - return nfserr_jukebox; s->si_opaque.so_id = (u32)new_id; /* Will be incremented before return to client: */ s->si_generation = 0; - return 0; +} + +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) +{ + struct idr *stateids = &cl->cl_stateids; + + if (!idr_pre_get(stateids, GFP_KERNEL)) + return NULL; + /* + * Note: if we fail here (or any time between now and the time + * we actually get the new idr), we won't need to undo the idr + * preallocation, since the idr code caps the number of + * preallocated entries. + */ + return kmem_cache_alloc(slab, GFP_KERNEL); } static struct nfs4_delegation * @@ -262,7 +273,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; - __be32 status; dprintk("NFSD alloc_init_deleg\n"); /* @@ -276,14 +286,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv return NULL; if (num_delegations > max_delegations) return NULL; - dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - status = init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); - if (status) { - kmem_cache_free(deleg_slab, dp); - return NULL; - } + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); /* * delegation seqid's are never incremented. 
The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -2331,14 +2337,11 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return oo; } -static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_client *clp = oo->oo_owner.so_client; - __be32 status; - status = init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); - if (status) - return status; + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -2350,7 +2353,6 @@ static inline __be32 init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_ __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; - return nfs_ok; } static void @@ -2614,10 +2616,14 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st return nfs_ok; } -static inline struct nfs4_ol_stateid * -nfs4_alloc_stateid(void) +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) { - return kmem_cache_alloc(stateid_slab, GFP_KERNEL); + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + +static void nfs4_free_stateid(struct nfs4_ol_stateid *s) +{ + kmem_cache_free(stateid_slab, s); } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -2661,15 +2667,16 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_ol_stateid **stpp, struct nfsd4_open *open) { struct nfs4_ol_stateid *stp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; __be32 status; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(cl); if (stp == NULL) return nfserr_jukebox; status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) { - kmem_cache_free(stateid_slab, stp); + nfs4_free_stateid(stp); return status; } *stpp = stp; @@ -2912,11 +2919,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; - status = init_open_stateid(stp, fp, open); - if (status) { - release_open_stateid(stp); - goto out; - } + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); @@ -3812,16 +3815,11 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = lo->lo_owner.so_client; - __be32 status; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(clp); if (stp == NULL) return NULL; - status = init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); - if (status) { - free_generic_stateid(stp); - return NULL; - } + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index eab9dae23c06..1a5820066040 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -85,6 +85,7 @@ struct nfs4_stid { }; struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ @@ -93,7 +94,6 @@ struct 
nfs4_delegation { u32 dl_type; time_t dl_time; /* For recall: */ - struct nfs4_stid dl_stid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -434,7 +434,7 @@ static inline struct file *find_any_file(struct nfs4_file *f) /* "ol" stands for "Open or Lock". Better suggestions welcome. */ struct nfs4_ol_stateid { - struct nfs4_stid st_stid; + struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; -- cgit v1.2.3-58-ga151 From 4cdc951b8611de4ce25e35c9fb8c0656150c9245 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Oct 2011 15:57:47 -0400 Subject: nfsd4: preallocate open stateid in process_open1() As with the nfs4_file, we'd prefer to find out about any failure before creating a new file rather than after. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 49 ++++++++++++++++++++----------------------------- fs/nfsd/xdr4.h | 1 + 2 files changed, 21 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1f8c781c2a28..3e1d4e08dfad 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -268,6 +268,11 @@ static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cac return kmem_cache_alloc(slab, GFP_KERNEL); } +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) +{ + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -2511,6 +2516,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; + __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; @@ -2538,12 +2544,20 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, open->op_openowner = NULL; goto new_owner; } - return nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + clp = oo->oo_owner.so_client; + goto alloc_stateid; new_owner: oo = alloc_init_open_stateowner(strhashval, clp, open); if (oo == NULL) return nfserr_jukebox; open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; return nfs_ok; } @@ -2616,11 +2630,6 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st return nfs_ok; } -static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) -{ - return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); -} - static void nfs4_free_stateid(struct nfs4_ol_stateid *s) { kmem_cache_free(stateid_slab, s); @@ -2661,28 +2670,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, return nfs_ok; } -static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_ol_stateid **stpp, - struct nfs4_file *fp, struct svc_fh *cur_fh, - struct nfsd4_open *open) -{ - struct nfs4_ol_stateid *stp; - struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; - __be32 status; - - stp = nfs4_alloc_stateid(cl); - if (stp == NULL) - return nfserr_jukebox; - - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); - if (status) { - nfs4_free_stateid(stp); - return status; - } - *stpp = stp; - return 0; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct 
nfsd4_open *open) @@ -2916,9 +2903,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (status) goto out; } else { - status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; + stp = open->op_stp; + open->op_stp = NULL; init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { @@ -2975,6 +2964,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) } if (open->op_file) nfsd4_free_file(open->op_file); + if (open->op_stp) + nfs4_free_stateid(open->op_stp); } __be32 diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 502dd43634f9..ce8c59196b4e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -229,6 +229,7 @@ struct nfsd4_open { int op_truncate; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr -- cgit v1.2.3-58-ga151 From 856121b2e83bd64bffdc8de449d24c9295e92ff3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2011 11:37:11 -0400 Subject: nfsd4: warn on open failure after create If we create the object and then return failure to the client, we're left with an unexpected file in the filesystem. I'm trying to eliminate such cases but not 100% sure I have so an assertion might be helpful for now. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 7 ++++--- fs/nfsd/vfs.c | 2 +- fs/nfsd/vfs.h | 2 +- fs/nfsd/xdr4.h | 3 ++- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 10b50d78bdc3..710b97b7a2f3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -194,7 +194,6 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o { struct svc_fh resfh; __be32 status; - int created = 0; fh_init(&resfh, NFS4_FHSIZE); open->op_truncate = 0; @@ -223,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, - &open->op_truncate, &created); + &open->op_truncate, &open->op_created); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -253,7 +252,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o /* set reply cache */ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); - if (!created) + if (!open->op_created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -318,6 +317,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* We don't yet support WANT bits: */ open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + open->op_created = 0; /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock @@ -408,6 +408,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 
*/ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + WARN_ON(status && open->op_created); out: nfsd4_cleanup_open_state(open, status); if (open->op_openowner) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4dd91283d039..7a2e442623c8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1370,7 +1370,7 @@ __be32 do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, - int *truncp, int *created) + bool *truncp, bool *created) { struct dentry *dentry, *dchild = NULL; struct inode *dirp; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 503f3bf11abd..3f54ad03bb2b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -62,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, - u32 *verifier, int *truncp, int *created); + u32 *verifier, bool *truncp, bool *created); __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ce8c59196b4e..e3057350eea1 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -226,7 +226,8 @@ struct nfsd4_open { u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ - int op_truncate; /* used during processing */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ -- cgit v1.2.3-58-ga151 From 215fc6af739d2dfdcad5496248d0d6302eb8a604 Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Tue, 18 Oct 2011 10:49:51 -0400 Subject: ext4: MMP: kmmpd should use nodename from init_uts_ns.name, not sysname sysname holds "Linux" by default, i.e. what appears when doing a "uname -s"; nodename should be used to print the machine's hostname, i.e. what is returned when doing a "uname -n" or "hostname", and what gethostname(2)/sethostname(2) manipulate, in order to notify the administrator of the node which is contending to mount the filesystem. Acked-by: Andreas Dilger Signed-off-by: Nikitas Angelinas Signed-off-by: Andrew Perepechko Signed-off-by: "Theodore Ts'o" --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9bdef3f537c5..2fca64efd6db 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -109,7 +109,7 @@ static int kmmpd(void *data) mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); bdevname(bh->b_bdev, mmp->mmp_bdevname); - memcpy(mmp->mmp_nodename, init_utsname()->sysname, + memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); while (!kthread_should_stop()) { -- cgit v1.2.3-58-ga151 From bdfc230f33a9dab316dcb6be4696ae59e1562e3c Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Tue, 18 Oct 2011 10:51:51 -0400 Subject: ext4: MMP: fix error message rate-limiting logic in kmmpd Current logic would print an error message only once, and then 'failed_writes' would stay at 1. Rework the loop to increment 'failed_writes' and print the error message every s_mmp_update_interval * 60 seconds, as intended according to the comment. 
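To illustrate the difference, here is a minimal standalone simulation of the two loop shapes over 180 consecutive failed writes; the kmmpd context is abstracted away, and the "printed" counter is a stand-in for the ext4_error() call:

#include <stdio.h>

int main(void)
{
    int failed_writes, i, printed;

    /* Old logic: the counter is only advanced when the message prints,
     * so after the first failure it sticks at 1 and the message never
     * prints again. */
    failed_writes = 0;
    printed = 0;
    for (i = 0; i < 180; i++) {
        if ((failed_writes % 60) == 0) {
            printed++;          /* ext4_error(...) */
            failed_writes++;
        }
    }
    printf("old logic: %d message(s)\n", printed);  /* prints 1 */

    /* New logic: count every failure, print every 60th one. */
    failed_writes = 0;
    printed = 0;
    for (i = 0; i < 180; i++) {
        if ((failed_writes % 60) == 0)
            printed++;          /* ext4_error(...) */
        failed_writes++;
    }
    printf("new logic: %d message(s)\n", printed);  /* prints 3 */
    return 0;
}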
Signed-off-by: Nikitas Angelinas Signed-off-by: Andrew Perepechko Signed-off-by: "Theodore Ts'o" Acked-by: Andreas Dilger --- fs/ext4/mmp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 2fca64efd6db..6b327423e622 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -125,8 +125,9 @@ static int kmmpd(void *data) * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. */ - if (retval && (failed_writes % 60) == 0) { - ext4_error(sb, "Error writing to MMP block"); + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); failed_writes++; } -- cgit v1.2.3-58-ga151 From f6f96fdb8c2779f9bd8ed7b0b08405b4439c982b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 18 Oct 2011 10:53:51 -0400 Subject: ext4: Fix comparison endianness problem in MMP initialization As part of startup, the MMP initialization code does this: mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); Next, mmp->mmp_seq is written out to disk, a delay happens, and then the MMP block is read back in and the sequence value is tested: if (seq != le32_to_cpu(mmp->mmp_seq)) { /* fail the mount */ On a LE system such as x86, the *le32* functions do nothing and this works. Unfortunately, on a BE system such as ppc64, this comparison becomes: if (cpu_to_le32(new_seq) != le32_to_cpu(cpu_to_le32(new_seq)) { /* fail the mount */ Except for a few palindromic sequence numbers, this test always causes the mount to fail, which makes MMP filesystems generally unmountable on ppc64. The attached patch fixes this situation. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 6b327423e622..7ea4ba4eff2a 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -296,7 +296,8 @@ skip: /* * write a new random sequence number. */ - mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); retval = write_mmp_block(bh); if (retval) -- cgit v1.2.3-58-ga151 From 1bce63d1a2a2c8929442b79acd4eab2e3db10a0b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 18 Oct 2011 10:55:51 -0400 Subject: ext4: add block plug for .writepages Add block plug for ext4 .writepages. Though ext4 .writepages already handles request merge very well, block plug is still helpful to reduce block lock contention. 
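For reference, the plugging pattern the patch adds looks like this in the abstract. This is a kernel-context sketch, not a standalone program, and write_many_pages() is a made-up name standing in for ext4_da_writepages():

#include <linux/blkdev.h>

/*
 * Requests submitted between blk_start_plug() and blk_finish_plug()
 * are queued per-task and flushed to the request queue in one batch,
 * so the queue lock is taken once per batch instead of once per bio.
 */
static void write_many_pages(struct address_space *mapping)
{
    struct blk_plug plug;

    blk_start_plug(&plug);
    /* ... submit all dirty pages of the mapping as bios ... */
    blk_finish_plug(&plug);
}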
Signed-off-by: Shaohua Li Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4863238c3754..081bb25a9ad3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2160,6 +2160,7 @@ static int ext4_da_writepages(struct address_space *mapping, struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); pgoff_t done_index = 0; pgoff_t end; + struct blk_plug plug; trace_ext4_da_writepages(inode, wbc); @@ -2238,6 +2239,7 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); + blk_start_plug(&plug); while (!ret && wbc->nr_to_write > 0) { /* @@ -2302,6 +2304,7 @@ retry: */ break; } + blk_finish_plug(&plug); if (!io_done && !cycled) { cycled = 1; index = 0; -- cgit v1.2.3-58-ga151 From e0cbee3e14195ef07b8ab6ff30930fb93d2e510a Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 18 Oct 2011 10:57:51 -0400 Subject: ext4: functions should not be declared extern The function declarations in ext4.h are already marked extern, so it's not necessary to do so in the .c files. This quiets the sparse noise: warning: function 'ext4_flush_completed_IO' with external linkage has definition warning: function 'ext4_init_inode_table' with external linkage has definition Signed-off-by: H Hartley Sweeten Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 036f78f7a1ef..c942924a0645 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. */ -extern int ext4_flush_completed_IO(struct inode *inode) +int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 04219988e75f..e007fecdaedc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1100,7 +1100,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_claim_inode until we are finished. 
*/ -extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); -- cgit v1.2.3-58-ga151 From e6705f7c255d1ffae7cd161d0b657296f4dd62fd Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 18 Oct 2011 10:59:51 -0400 Subject: ext4: add __user decoration to calls of copy_{from,to}_user() This quiets the sparse noise: warning: incorrect type in argument 2 (different address spaces) expected void const [noderef] *from got struct fstrim_range * warning: incorrect type in argument 1 (different address spaces) expected void [noderef] *to got struct fstrim_range * Signed-off-by: H Hartley Sweeten Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8f7ea6905421..a56796814d6a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -350,7 +350,7 @@ mext_out: return -EOPNOTSUPP; } - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -360,7 +360,7 @@ mext_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; -- cgit v1.2.3-58-ga151 From ee90d57e20bd4749dda3b14397392b18b89dc3ef Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 18 Oct 2011 11:01:51 -0400 Subject: ext4: quiet sparse noise about plain integer as NULL pointer The third parameter to ext4_free_blocks is a struct buffer_head *. This parameter should be NULL not 0. This quiets the sparse noise: warning: Using plain integer as NULL pointer Signed-off-by: H Hartley Sweeten Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5c4861210d4c..2fc6cc0a51ca 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2291,7 +2291,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, start = ext4_ext_pblock(ex); ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); + ext4_free_blocks(handle, inode, NULL, start, num, flags); } else { printk(KERN_INFO "strange request: removal(2) " -- cgit v1.2.3-58-ga151 From 7748dd6eab8e13f974d4664395e76afffacda04b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Oct 2011 12:41:35 +0300 Subject: CIFS: cleanup min_t() cast in cifs_read() Smatch complains that the cast to "int" in min_t() changes very large values of current_read_size into negative values and so min_t() could return the wrong value. I removed the const as well, as that doesn't do anything here. 
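To see what Smatch is complaining about, here is a minimal standalone program; min_t() below is a simplified stand-in for the kernel macro, which additionally does type checking:

#include <stdio.h>

#define min_t(type, x, y) ((type)(x) < (type)(y) ? (type)(x) : (type)(y))

int main(void)
{
    size_t bytes_left = 0x90000000UL;   /* larger than INT_MAX */
    unsigned int rsize = 16384;

    /* On common two's-complement ABIs, (int)0x90000000 is a large
     * negative number, so min_t(int, ...) picks it as the "minimum". */
    printf("min_t(int, ...)  = %d\n", min_t(int, bytes_left, rsize));
    /* With an unsigned type the comparison behaves as intended. */
    printf("min_t(uint, ...) = %u\n", min_t(unsigned int, bytes_left, rsize));
    return 0;
}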
Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7f84ece116d0..852d1f39adae 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1904,13 +1904,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(const int, read_size - total_read, + current_read_size = min_t(uint, read_size - total_read, cifs_sb->rsize); /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { - current_read_size = min_t(const int, current_read_size, + current_read_size = min_t(uint, current_read_size, CIFSMaxBufSize); } rc = -EAGAIN; -- cgit v1.2.3-58-ga151 From ad4778fb40994dd7c779069dad6ff704d75b81e6 Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Tue, 18 Oct 2011 10:58:50 +0200 Subject: CIFS: fix automount for DFS shares Automounting directories are now invalidated by .d_revalidate() so as to be d_instantiate()d again with the right DCACHE_NEED_AUTOMOUNT flag. Reviewed-by: Jeff Layton Signed-off-by: Gerlando Falauto Signed-off-by: Steve French --- fs/cifs/dir.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 0c8098d54d2b..d7eeb9d3ed6f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -648,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; - else + else { + /* + * Forcibly invalidate automounting directory inodes + * (remote DFS directories) so to have them + * instantiated again for automount + */ + if (IS_AUTOMOUNT(direntry->d_inode)) + return 0; return 1; + } } /* -- cgit v1.2.3-58-ga151 From 59b7c05fffba030e5d9e72324691e2f99aa69b79 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Oct 2011 18:22:55 -0700 Subject: Revert "NFS: Ensure that writeback_single_inode() calls write_inode() when syncing" This reverts commit b80c3cb628f0ebc241b02e38dd028969fb8026a2. The reverted commit was rendered obsolete by a VFS fix: commit 5547e8aac6f71505d621a612de2fca0dd988b439 (writeback: Update dirty flags in two steps). We no longer need to worry about writeback_single_inode() missing our marking of the inode for COMMIT in the 'do_writepages()' call. Reverting this patch fixes a performance regression in which the inode would continuously get queued to the dirty list, causing the writeback code to unnecessarily try to send a COMMIT.
Signed-off-by: Trond Myklebust Tested-by: Simon Kirby Cc: stable@kernel.org [2.6.35+] --- fs/nfs/write.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c9bd2a6b7d4b..71296cc97f98 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -428,7 +428,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); - __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); -- cgit v1.2.3-58-ga151 From 3236c3e1adc0c7ec83eaff1de2d06746b7c5bb28 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Oct 2011 09:49:21 -0400 Subject: nfs: don't redirty inode when ncommit == 0 in nfs_commit_unstable_pages commit 420e3646 allowed the kernel to reduce the number of unnecessary commit calls by skipping the commit when there are a large number of outstanding pages. However, the current test in nfs_commit_unstable_pages does not handle the edge condition properly. When ncommit == 0, then that means that the kernel doesn't need to do anything more for the inode. The current test though in the WB_SYNC_NONE case will return true, and the inode will end up being marked dirty. Once that happens the inode will never be clean until there's a WB_SYNC_ALL flush. Fix this by immediately returning from nfs_commit_unstable_pages when ncommit == 0. Mike noticed this problem initially in RHEL5 (2.6.18-based kernel) which has a backported version of 420e3646. The inode cache there was growing very large. The inode cache was unable to be shrunk since the inodes were all marked dirty. Calling sync() would essentially "fix" the problem -- the WB_SYNC_ALL flush would result in the inodes all being marked clean. What I'm not clear on is how big a problem this is in mainline kernels as the writeback code there is very different. Either way, it seems incorrect to re-mark the inode dirty in this case. Reported-by: Mike McLean Signed-off-by: Jeff Layton Cc: stable@kernel.org [2.6.34+] Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 71296cc97f98..46aa4389ce13 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1554,6 +1554,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. -- cgit v1.2.3-58-ga151 From b9dd3abbbc708da5e3c53424a5b2c66ab580f97e Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Wed, 12 Oct 2011 15:09:34 +0800 Subject: nfs: fix bug about IPv6 address scope checking The result from ipv6_addr_scope() is not always a single scope value, so we can't compare it for equality with IPV6_ADDR_SCOPE_LINKLOCAL in nfs_sockaddr_match_ipaddr6(). This patch fixes the problem, and checks the address before the scope_id.
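For illustration, a standalone userspace analogue of the corrected check ordering: compare the addresses first, and only consult sin6_scope_id when the address is link-local. sockaddr_match_ipaddr6() is a made-up stand-in here; the kernel code uses ipv6_addr_equal() and ipv6_addr_type() where this sketch uses memcmp() and glibc's IN6_IS_ADDR_LINKLOCAL():

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int sockaddr_match_ipaddr6(const struct sockaddr_in6 *a,
                                  const struct sockaddr_in6 *b)
{
    /* Different addresses never match, whatever the scope. */
    if (memcmp(&a->sin6_addr, &b->sin6_addr, sizeof(a->sin6_addr)))
        return 0;
    /* Only link-local addresses need matching scope ids. */
    if (IN6_IS_ADDR_LINKLOCAL(&a->sin6_addr))
        return a->sin6_scope_id == b->sin6_scope_id;
    return 1;
}

int main(void)
{
    struct sockaddr_in6 a = { .sin6_family = AF_INET6, .sin6_scope_id = 2 };
    struct sockaddr_in6 b = { .sin6_family = AF_INET6, .sin6_scope_id = 3 };

    inet_pton(AF_INET6, "fe80::1", &a.sin6_addr);
    inet_pton(AF_INET6, "fe80::1", &b.sin6_addr);
    /* Same link-local address, different scopes: no match. */
    printf("%d\n", sockaddr_match_ipaddr6(&a, &b));     /* 0 */

    inet_pton(AF_INET6, "2001:db8::1", &a.sin6_addr);
    inet_pton(AF_INET6, "2001:db8::1", &b.sin6_addr);
    /* Same global address: scope ids are ignored. */
    printf("%d\n", sockaddr_match_ipaddr6(&a, &b));     /* 1 */
    return 0;
}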
Signed-off-by: Mi Jinlong Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5833fbbf59b0..b4e41dd4d0f6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3-58-ga151 From 2da956523526e440ef4f4dd174e26f5ac06fe011 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 12 Oct 2011 10:57:42 -0400 Subject: nfs: don't try to migrate pages with active requests nfs_find_and_lock_request will take a reference to the nfs_page and will then put it if the req is already locked. It's possible though that the reference will be the last one. That put then can kick off a whole series of reference puts: nfs_page nfs_open_context dentry inode If the inode ends up being deleted, then the VFS will call truncate_inode_pages. That function will try to take the page lock, but it was already locked when migrate_page was called. The code deadlocks. Fix this by simply refusing the migration request if PagePrivate is already set, indicating that the page is already associated with an active read or write request. We've had a customer test a backported version of this patch and the preliminary results seem good. Cc: stable@kernel.org Cc: Andrea Arcangeli Reported-by: Harshula Jayasuriya Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 46aa4389ce13..72813ede029e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1691,34 +1691,20 @@ out_error: int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) { - struct nfs_page *req; - int ret; + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. 
+ */ + if (PagePrivate(page)) + return -EBUSY; nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page, false); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; - - ret = migrate_page(mapping, newpage, page); - if (!req) - goto out; - if (ret) - goto out_unlock; - page_cache_get(newpage); - spin_lock(&mapping->host->i_lock); - req->wb_page = newpage; - SetPagePrivate(newpage); - set_page_private(newpage, (unsigned long)req); - ClearPagePrivate(page); - set_page_private(page, 0); - spin_unlock(&mapping->host->i_lock); - page_cache_release(page); -out_unlock: - nfs_clear_page_tag_locked(req); -out: - return ret; + return migrate_page(mapping, newpage, page); } #endif -- cgit v1.2.3-58-ga151 From 516f2e24faa7548a61d9ba790958528469c2e284 Mon Sep 17 00:00:00 2001 From: Jim Rees Date: Thu, 22 Sep 2011 21:50:08 -0400 Subject: pnfsblock: fix return code confusion Always return PTR_ERR, not NULL, from nfs4_blk_get_deviceinfo and nfs4_blk_decode_device. Check for IS_ERR, not NULL, in bl_set_layoutdriver when calling nfs4_blk_get_deviceinfo. Signed-off-by: Jim Rees Signed-off-by: Benny Halevy Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 20 ++++++++++++-------- fs/nfs/blocklayout/blocklayoutdev.c | 13 ++++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9561c8fc8bdb..d2432f0dc40c 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -805,7 +805,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, struct nfs4_deviceid *d_id) { struct pnfs_device *dev; - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; u32 max_resp_sz; int max_pages; struct page **pages = NULL; @@ -823,18 +823,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dev = kmalloc(sizeof(*dev), GFP_NOFS); if (!dev) { dprintk("%s kmalloc failed\n", __func__); - return NULL; + return ERR_PTR(-ENOMEM); } pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); if (pages == NULL) { kfree(dev); - return NULL; + return ERR_PTR(-ENOMEM); } for (i = 0; i < max_pages; i++) { pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); goto out_free; + } } memcpy(&dev->dev_id, d_id, sizeof(*d_id)); @@ -847,8 +849,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); rc = nfs4_proc_getdeviceinfo(server, dev); dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) + if (rc) { + rv = ERR_PTR(rc); goto out_free; + } rv = nfs4_blk_decode_device(server, dev); out_free: @@ -866,7 +870,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) struct pnfs_devicelist *dlist = NULL; struct pnfs_block_dev *bdev; LIST_HEAD(block_disklist); - int status = 0, i; + int status, i; dprintk("%s enter\n", __func__); @@ -898,8 +902,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) for (i = 0; i < dlist->num_devs; i++) { bdev = nfs4_blk_get_deviceinfo(server, fh, &dlist->dev_id[i]); - if (!bdev) { - status = -ENODEV; + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); goto out_error; } spin_lock(&b_mt_id->bm_lock); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a83b393fb01c..0b1fb0e25b93 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ 
b/fs/nfs/blocklayout/blocklayoutdev.c @@ -131,7 +131,7 @@ struct pnfs_block_dev * nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev) { - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; struct block_device *bd = NULL; struct rpc_pipe_msg msg; struct bl_msg_hdr bl_msg = { @@ -141,7 +141,7 @@ nfs4_blk_decode_device(struct nfs_server *server, uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); struct bl_dev_msg *reply = &bl_mount_reply; - int offset, len, i; + int offset, len, i, rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, @@ -168,8 +168,10 @@ nfs4_blk_decode_device(struct nfs_server *server, dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&bl_wq, &wq); - if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + if (rc < 0) { remove_wait_queue(&bl_wq, &wq); + rv = ERR_PTR(rc); goto out; } @@ -187,8 +189,9 @@ nfs4_blk_decode_device(struct nfs_server *server, bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", - __func__, PTR_ERR(bd)); + rc = PTR_ERR(bd); + dprintk("%s failed to open device : %d\n", __func__, rc); + rv = ERR_PTR(rc); goto out; } -- cgit v1.2.3-58-ga151 From fdc17abbc4b6094b34ee8ff5d91eaba8637594a2 Mon Sep 17 00:00:00 2001 From: Jim Rees Date: Thu, 22 Sep 2011 21:50:09 -0400 Subject: pnfsblock: fix size of upcall message Make the status field explicitly 32 bits. "...it's unlikely that the kernel and userspace would differ on the size of an int here, but it might be a good idea to go ahead and make that explicitly 32 bits in case we end up dealing with more exotic arches at some point in the future." Suggested-by: Jeff Layton Signed-off-by: Jim Rees Signed-off-by: Benny Halevy Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index f27d827960a3..58dc256402e3 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) } struct bl_dev_msg { - int status; + int32_t status; uint32_t major, minor; }; -- cgit v1.2.3-58-ga151 From c1225158a8dad9e9d5eee8a17dbbd9c7cda05ab9 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:10 -0400 Subject: SUNRPC/NFS: make rpc pipe upcall generic The same function is used by idmap, gss and blocklayout code. Make it generic. 
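One detail worth noting: the shared helper copies from msg->copied onward, i.e. min(msg->len - msg->copied, buflen), where the gss and idmap copies it replaces used min(msg->len, buflen). A standalone sketch of that resumable-copy bookkeeping, with copy_to_user() replaced by memcpy(); struct pipe_msg and generic_upcall() are made-up names:

#include <stdio.h>
#include <string.h>

struct pipe_msg {
    const char *data;
    size_t len;
    size_t copied;
};

/* Deliver at most buflen bytes starting at the resume point, and
 * advance the resume point by what was actually delivered. */
static size_t generic_upcall(struct pipe_msg *msg, char *dst, size_t buflen)
{
    size_t mlen = msg->len - msg->copied;

    if (mlen > buflen)
        mlen = buflen;
    memcpy(dst, msg->data + msg->copied, mlen);
    msg->copied += mlen;
    return mlen;
}

int main(void)
{
    struct pipe_msg msg = { "upcall-payload", 14, 0 };
    char buf[6];
    size_t n;

    /* A small reader drains the message across several reads. */
    while ((n = generic_upcall(&msg, buf, sizeof(buf))) > 0)
        printf("read %zu byte(s): %.*s\n", n, (int)n, buf);
    return 0;
}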
Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nfs/blocklayout/blocklayout.h | 2 -- fs/nfs/blocklayout/blocklayoutdev.c | 22 ---------------------- fs/nfs/idmap.c | 25 +------------------------ include/linux/sunrpc/rpc_pipe_fs.h | 2 ++ net/sunrpc/auth_gss/auth_gss.c | 24 ++---------------------- net/sunrpc/rpc_pipe.c | 20 ++++++++++++++++++++ 7 files changed, 26 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d2432f0dc40c..dc23833c0231 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -964,7 +964,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { }; static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = bl_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 58dc256402e3..42acf7ef5992 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq; #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ /* blocklayoutdev.c */ -ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); struct block_device *nfs4_blkdev_get(dev_t dev); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index 0b1fb0e25b93..d08ba9107fde 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev) return blkdev_put(bdev, FMODE_READ); } -/* - * Shouldn't there be a rpc_generic_upcall() to do this for us? 
- */ -ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len - msg->copied, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static struct bl_dev_msg bl_mount_reply; ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index f20801ae0a16..47d1c6ff2d8e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -336,8 +336,6 @@ struct idmap { struct idmap_hashtable idmap_group_hash; }; -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); @@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); static const struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = idmap_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, }; @@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, return ret; } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index cf14db975da0..e4ea43058d8f 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -44,6 +44,8 @@ RPC_I(struct inode *inode) return container_of(inode, struct rpc_inode, vfs_inode); } +extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); struct rpc_clnt; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 364eb45e989d..e9b76939268d 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -603,26 +603,6 @@ out: return err; } -static ssize_t -gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - #define MSG_BUF_MAXSIZE 1024 static ssize_t @@ -1590,7 +1570,7 @@ static const struct rpc_credops gss_nullops = { }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { - .upcall = gss_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, @@ -1598,7 +1578,7 @@ static const struct rpc_pipe_ops gss_upcall_ops_v0 = { }; static const struct rpc_pipe_ops 
gss_upcall_ops_v1 = { - .upcall = gss_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index b181e3441323..67dbc1884383 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -77,6 +77,26 @@ rpc_timeout_upcall_queue(struct work_struct *work) rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT); } +ssize_t rpc_pipe_generic_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + size_t mlen = min(msg->len - msg->copied, buflen); + unsigned long left; + + left = copy_to_user(dst, data, mlen); + if (left == mlen) { + msg->errno = -EFAULT; + return -EFAULT; + } + + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} +EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall); + /** * rpc_queue_upcall - queue an upcall message to userspace * @inode: inode of upcall pipe on which to queue given message -- cgit v1.2.3-58-ga151 From 760383f1ee4d14b0e0bdf0cddee648d9b8633429 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:11 -0400 Subject: pnfsblock: add missing rpc_put_mount and path_put Reviewed-by: Jeff Layton Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index dc23833c0231..dee6cae80fea 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -993,17 +993,20 @@ static int __init nfs4blocklayout_init(void) mnt, NFS_PIPE_DIRNAME, 0, &path); if (ret) - goto out_remove; + goto out_putrpc; bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, &bl_upcall_ops, 0); + path_put(&path); if (IS_ERR(bl_device_pipe)) { ret = PTR_ERR(bl_device_pipe); - goto out_remove; + goto out_putrpc; } out: return ret; +out_putrpc: + rpc_put_mount(); out_remove: pnfs_unregister_layoutdriver(&blocklayout_type); return ret; @@ -1016,6 +1019,7 @@ static void __exit nfs4blocklayout_exit(void) pnfs_unregister_layoutdriver(&blocklayout_type); rpc_unlink(bl_device_pipe); + rpc_put_mount(); } MODULE_ALIAS("nfs-layouttype4-3"); -- cgit v1.2.3-58-ga151 From 1b0ae068779874f54b55aac3a2a992bcf3f2c3c4 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:12 -0400 Subject: pnfs: make _set_lo_fail generic The file layout and block layout both use it to set the layout I/O failure bit. So make it generic.
Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 17 +++-------------- fs/nfs/nfs4filelayout.c | 19 +++---------------- fs/nfs/pnfs.c | 12 ++++++++++++ fs/nfs/pnfs.h | 1 + 4 files changed, 19 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index dee6cae80fea..2167ba2afdbb 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -176,17 +176,6 @@ retry: return bio; } -static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - /* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { @@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err) if (!uptodate) { if (!rdata->pnfs_error) rdata->pnfs_error = -EIO; - bl_set_lo_fail(rdata->lseg); + pnfs_set_lo_fail(rdata->lseg); } bio_put(bio); put_parallel(par); @@ -370,7 +359,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); @@ -386,7 +375,7 @@ static void bl_end_io_write(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index e8915d4840ad..4c78c62639e6 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -/* For data server errors we don't recover from */ -static void -filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, @@ -145,7 +132,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); clp = NFS_SERVER(data->inode)->nfs_client; } @@ -221,7 +208,7 @@ static int filelayout_write_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); clp = NFS_SERVER(data->inode)->nfs_client; } else @@ -256,7 +243,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { prepare_to_resend_writes(data); - 
filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); } else nfs_restart_rpc(task, data->ds_clp); return -EAGAIN; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e550e8836c37..6b19fffa812a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1381,6 +1381,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) } } +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); + void pnfs_set_layoutcommit(struct nfs_write_data *wdata) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 01cbfd54f3cb..94e760eb543a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); -- cgit v1.2.3-58-ga151 From 8ce160c5ef06cc89c2b6b26bfa5ef7a5ce2c93e0 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:14 -0400 Subject: pnfs: recoalesce when ld write pagelist fails For pnfs pagelist write failure, we need to pg_recoalesce and resend IO to mds. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 20 +++++++------------- fs/nfs/pnfs.h | 2 +- fs/nfs/write.c | 25 ++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6b19fffa812a..a205c8e2c731 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1168,23 +1168,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_write_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_write(data, NFS_CLIENT(data->inode), - data->mds_ops, NFS_FILE_SYNC); - return status ? 
: -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94e760eb543a..71c23d40f735 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -201,7 +201,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); -int pnfs_ld_write_done(struct nfs_write_data *); +void pnfs_ld_write_done(struct nfs_write_data *); int pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 72813ede029e..106fd0634ab3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1166,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int status = data->task.tk_status; + int ret, status = data->task.tk_status; + struct nfs_pageio_descriptor pgio; + + if (data->pnfs_error) { + nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); + pgio.pg_recoalesce = 1; + } /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1182,6 +1188,11 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); + if (data->pnfs_error) { + dprintk(", pnfs error = %d\n", data->pnfs_error); + goto next; + } + if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1201,7 +1212,19 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); + if (data->pnfs_error) { + lock_page(page); + nfs_pageio_cond_complete(&pgio, page->index); + ret = nfs_page_async_flush(&pgio, page, 0); + if (ret) { + nfs_set_pageerror(page); + dprintk("rewrite to MDS error = %d\n", ret); + } + unlock_page(page); + } } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } -- cgit v1.2.3-58-ga151 From 9b7eecdcfeb943f130d86bbc249fde4994b6fe30 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:15 -0400 Subject: pnfs: recoalesce when ld read pagelist fails For pnfs pagelist read failure, we need to pg_recoalesce and resend IO to mds. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 20 +++++++------------- fs/nfs/pnfs.h | 2 +- fs/nfs/read.c | 12 +++++++++++- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a205c8e2c731..ee73d9a4f700 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1262,23 +1262,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_read_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_read(data, NFS_CLIENT(data->inode), - data->mds_ops); - return status ? 
: -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 71c23d40f735..1509530cb111 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -202,7 +202,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); -int pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2171c043ab08..bfc20b160243 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -541,13 +541,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) static void nfs_readpage_release_full(void *calldata) { struct nfs_read_data *data = calldata; + struct nfs_pageio_descriptor pgio; + if (data->pnfs_error) { + nfs_pageio_init_read_mds(&pgio, data->inode); + pgio.pg_recoalesce = 1; + } while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_readpage_release(req); + if (!data->pnfs_error) + nfs_readpage_release(req); + else + nfs_pageio_add_request(&pgio, req); } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_readdata_release(calldata); } -- cgit v1.2.3-58-ga151 From e6d05a757c314ad88d0649d3835a8a1daa964236 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:16 -0400 Subject: pnfsblock: fix NULL pointer dereference bl_add_page_to_bio returns error pointer. bio should be reset to NULL in failure cases as the out path always calls bl_submit_bio. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 2167ba2afdbb..4ddbfbf1c3ad 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -292,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) bl_end_io_read, par); if (IS_ERR(bio)) { rdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } } @@ -581,6 +582,7 @@ fill_invalid_ext: bl_end_io_write_zero, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } /* FIXME: This should be done in bi_end_io */ @@ -629,6 +631,7 @@ next_page: bl_end_io_write, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } isect += PAGE_CACHE_SECTORS; -- cgit v1.2.3-58-ga151 From 7542274519b3ba87555410c66e8356ac1e3bc9b3 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 22 Sep 2011 21:50:17 -0400 Subject: pnfsblock: fix writeback deadlock We should check if the sector is already initialized before trying to grab the page from page cache. Otherwise, when two pages of the same block are written back by two threads, each calling in from writepage_locked, it can cause a deadlock like the one below. [ 1080.972099] INFO: task kswapd0:25 blocked for more than 120 seconds. [ 1080.972377] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1080.972812] kswapd0 D ffff88000c4926c0 0 25 2 0x00000000 [ 1080.972816] ffff88000df276b0 0000000000000046 ffff88000df27640 ffffffff81013ba7 [ 1080.972821] ffff88000c492310 ffff88000df27fd8 ffff88000df27fd8 00000000001d3440 [ 1080.972824] ffff88000c378000 ffff88000c492310 ffff8800175d3d40 ffff880017fc75a8 [ 1080.972828] Call Trace: [ 1080.972860] [] ? read_tsc+0x9/0x19 [ 1080.972877] [] ? lock_page+0x2b/0x2b [ 1080.972899] [] io_schedule+0x63/0x7e [ 1080.972902] [] sleep_on_page+0xe/0x12 [ 1080.972905] [] __wait_on_bit_lock+0x46/0x8f [ 1080.972916] [] ? lock_release_holdtime.part.7+0x6b/0x72 [ 1080.972919] [] __lock_page+0x66/0x68 [ 1080.972928] [] ? autoremove_wake_function+0x3d/0x3d [ 1080.972932] [] lock_page+0x27/0x2b [ 1080.972934] [] find_lock_page+0x34/0x57 [ 1080.972937] [] find_or_create_page+0x34/0x8a [ 1080.972947] [] bl_write_pagelist+0x205/0x6da [blocklayoutdriver] [ 1080.972951] [] ? bl_free_lseg+0x38/0x38 [blocklayoutdriver] [ 1080.972995] [] ? nfs_write_rpcsetup+0x118/0x123 [nfs] [ 1080.973033] [] pnfs_generic_pg_writepages+0x10b/0x1f4 [nfs] [ 1080.973089] [] nfs_pageio_doio+0x1a/0x43 [nfs] [ 1080.973098] [] nfs_pageio_complete+0x16/0x2d [nfs] [ 1080.973108] [] nfs_writepage_locked+0xa0/0xbf [nfs] [ 1080.973119] [] nfs_writepage+0x16/0x2b [nfs] [ 1080.973122] [] ? clear_page_dirty_for_io+0x87/0x9a [ 1080.973133] [] shrink_page_list+0x39b/0x6c8 [ 1080.973139] [] shrink_inactive_list+0x22c/0x39e [ 1080.973144] [] ? lock_release_holdtime.part.7+0x6b/0x72 [ 1080.973148] [] shrink_zone+0x445/0x588 [ 1080.973152] [] balance_pgdat+0x2c2/0x56b [ 1080.973170] [] ? __bitmap_weight+0x34/0x80 [ 1080.973175] [] kswapd+0x2be/0x2fa [ 1080.973179] [] ? __init_waitqueue_head+0x4b/0x4b [ 1080.973183] [] ? balance_pgdat+0x56b/0x56b [ 1080.973187] [] kthread+0xa8/0xb0 [ 1080.973200] [] kernel_thread_helper+0x4/0x10 [ 1080.973205] [] ? __init_kthread_worker+0x5a/0x5a [ 1080.973210] [] ? gs_change+0x13/0x13 [ 1080.973213] no locks held by kswapd0/25. Signed-off-by: Peng Tao Signed-off-by: Jim Rees Cc: stable@kernel.org [3.0] Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4ddbfbf1c3ad..281ae95932c9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -533,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) fill_invalid_ext: dprintk("%s need to zero %d pages\n", __func__, npg_zero); for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } /* page ref released in bl_end_io_write_zero */ index = isect >> PAGE_CACHE_SECTOR_SHIFT; dprintk("%s zero %dth page: index %lu isect %llu\n", @@ -552,8 +557,7 @@ fill_invalid_ext: * PageUptodate: It was read before * sector_initialized: already written out */ - if (PageDirty(page) || PageWriteback(page) || - bl_is_sector_init(be->be_inval, isect)) { + if (PageDirty(page) || PageWriteback(page)) { print_page(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-58-ga151 From 45402c38eec740f52422aafc92937c6a4a8c8c0e Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 2 Sep 2011 14:39:12 -0700 Subject: nfs/super.c: local functions should be static commit ae50c0b5 "pnfs: client stats" added additional information to the output of /proc/self/mountstats. 
The new functions introduced are only used in this file and should be marked static. If CONFIG_NFS_V4_1 is not defined, empty stub functions are used. If CONFIG_NFS_V4 is not defined these stub functions are not used at all. Adding static for the functions results in compile warnings: fs/nfs/super.c:743: warning: 'show_sessions' defined but not used fs/nfs/super.c:756: warning: 'show_pnfs' defined but not used Fix this by adding a #ifdef CONFIG_NFS_V4 guard around the two show_ functions. Signed-off-by: H Hartley Sweeten Cc: Trond Myklebust Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5b19b6aabe18..480b3b6bf71e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } + +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_sessions(struct seq_file *m, struct nfs_server *server) +static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) seq_printf(m, ",sessions"); } #else -void show_sessions(struct seq_file *m, struct nfs_server *server) {} +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif #endif +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_pnfs(struct seq_file *m, struct nfs_server *server) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) { seq_printf(m, ",pnfs="); if (server->pnfs_curr_ld) @@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server) else seq_printf(m, "not configured"); } -#else /* CONFIG_NFS_V4_1 */ -void show_pnfs(struct seq_file *m, struct nfs_server *server) {} -#endif /* CONFIG_NFS_V4_1 */ +#else +static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) { -- cgit v1.2.3-58-ga151 From a9a4a87a5942e9271523197a90aaa82349c818fb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Oct 2011 16:08:46 -0700 Subject: NFS: Use the inode->i_version to cache NFSv4 change attribute information Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- fs/nfs/fscache-index.c | 4 ++-- fs/nfs/inode.c | 16 ++++++++-------- fs/nfs/nfs4proc.c | 4 ++-- fs/nfs/write.c | 2 +- include/linux/nfs_fs.h | 1 - 6 files changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 321a66bc3846..7f2654069806 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; - delegation->change_attr = nfsi->change_attr; + delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; delegation->flags = 1<vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (bufmax > sizeof(auxdata)) bufmax = sizeof(auxdata); @@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + 
auxdata.change_attr = nfsi->vfs_inode.i_version; if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fe1203797b2b..4dc6d078f108 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -318,7 +318,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; + inode->i_version = 0; inode->i_size = 0; inode->i_nlink = 0; inode->i_uid = -2; @@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA; @@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; ret |= NFS_INO_INVALID_ATTR; @@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - nfsi->change_attr != fattr->change_attr) + inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ @@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->pre_change_attr = inode->i_version; fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { + if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) invalid |= save_cache_validity; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4700fae1ada0..0f0b6076de62 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -753,9 +753,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); - nfsi->change_attr = cinfo->after; + dir->i_version = cinfo->after; spin_unlock(&dir->i_lock); } diff 
--git a/fs/nfs/write.c b/fs/nfs/write.c index 106fd0634ab3..2084a6494218 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -390,7 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; + inode->i_version++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index eaac770f886e..60a137b7f171 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -149,7 +149,6 @@ struct nfs_inode { unsigned long read_cache_jiffies; unsigned long attrtimeo; unsigned long attrtimeo_timestamp; - __u64 change_attr; /* v4 only */ unsigned long attr_gencount; /* "Generation counter" for the attribute cache. This is -- cgit v1.2.3-58-ga151 From 2900b33999e2fc8a8edf0dddaafffec4da25ee10 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 18 Oct 2011 20:00:14 +0000 Subject: xfs: put in missed fix for merge problem I intended to do this as part of fixing part of the conflict with the merge with Linus' tree, but evidently it didn't get included in the commit. Signed-off-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans_ail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 512ff646d01c..4e3f9bbe0141 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -434,7 +434,7 @@ xfsaild_push( if (!IOP_PUSHBUF(lip)) { stuck++; - flush_log = 1; + ailp->xa_log_flush++; } else { ailp->xa_last_pushed_lsn = lsn; } -- cgit v1.2.3-58-ga151 From 9e4c109ac822395e0aae650e4e3c9e4903f6602f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Oct 2011 15:14:11 +0000 Subject: xfs: add AIL pushing tracepoints Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_trace.h | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans_ail.c | 8 ++++++++ 2 files changed, 45 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b78f2c65438b..f1d2802b2f07 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -30,6 +30,7 @@ struct xfs_buf_log_item; struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; +struct xfs_log_item; struct xlog_ticket; struct log; struct xlog_recover; @@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); 
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); + + DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), TP_ARGS(ip, count, offset, flags), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 4e3f9bbe0141..ed9252bcdac9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" +#include "xfs_trace.h" #include "xfs_error.h" #ifdef DEBUG @@ -425,14 +426,18 @@ xfsaild_push( switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); + trace_xfs_ail_pushbuf(lip); if (!IOP_PUSHBUF(lip)) { + trace_xfs_ail_pushbuf_pinned(lip); stuck++; ailp->xa_log_flush++; } else { @@ -443,12 +448,15 @@ xfsaild_push( case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + stuck++; ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); + trace_xfs_ail_locked(lip); stuck++; break; -- cgit v1.2.3-58-ga151 From 0c2e53f11a6dae9e3af5f50f5ad0382e7c3e0cfa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2011 16:11:22 -0700 Subject: NFS: Remove the unused "lookupfh()" version of nfs4_proc_lookup() ...and also remove the associated nfs_v4_clientops entry. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 62 ++++++++++++++----------------------------------- include/linux/nfs_xdr.h | 3 --- 2 files changed, 17 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f0b6076de62..b0c01b23422d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -2408,14 +2405,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, - const struct nfs_fh *dirfh, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { + struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, - .dir_fh = dirfh, + .dir_fh = NFS_FH(dir), .name = name, }; struct nfs4_lookup_res res = { @@ -2431,40 +2429,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, nfs_fattr_init(fattr); - dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); - dprintk("NFS 
reply lookupfh: %d\n", status); - return status; -} - -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); - /* FIXME: !!!! */ - if (err == -NFS4ERR_MOVED) { - err = -EREMOTE; - break; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; -} - -static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - int status; - dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); - if (status == -NFS4ERR_MOVED) - status = nfs4_get_referral(dir, name, fattr, fhandle); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -2485,11 +2451,18 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), - &exception); - if (err == -EPERM) + int status; + + status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); + switch (status) { + case -NFS4ERR_MOVED: + err = nfs4_get_referral(dir, name, fattr, fhandle); + break; + case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); + } + err = nfs4_handle_exception(NFS_SERVER(dir), + status, &exception); } while (exception.retry); return err; } @@ -6270,7 +6243,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, - .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index abd615d74a29..1e31e1c8655b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1197,9 +1197,6 @@ struct nfs_rpc_ops { int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); - int (*lookupfh)(struct nfs_server *, struct nfs_fh *, - struct qstr *, struct nfs_fh *, - struct nfs_fattr *); int (*getattr) (struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); int (*setattr) (struct dentry *, struct nfs_fattr *, -- cgit v1.2.3-58-ga151 From 08ef7bd3bc04261d14d570ac7eaac3eac947b1ba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2011 16:11:49 -0700 Subject: NFSv4: Translate NFS4ERR_BADNAME into ENOENT when applied to a lookup Both LOOKUP and OPEN operations may return NFS4ERR_BADNAME if we send an invalid name as a filename argument. As far as the application is concerned, it just has to know that the file doesn't exist, and so ENOENT would be the appropriate reply. We should only return EINVAL if the filename is being used to _create_ a new object on the remote filesystem.
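To make the rule concrete, the mapping amounts to something like the following sketch (illustrative only: nfs4_map_bad_name is a hypothetical helper, not part of the patch, and the create case is shown returning -EINVAL to match the changelog while the patch itself simply passes the NFS error up):

	static int nfs4_map_bad_name(int status, int open_flags)
	{
		if (status != -NFS4ERR_BADNAME)
			return status;
		/* a plain lookup of a bad name just means "no such file" */
		if (!(open_flags & O_CREAT))
			return -ENOENT;
		/* only creation of a new object should complain about the name */
		return -EINVAL;
	}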
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b0c01b23422d..ba0da50865fe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1593,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) int status; status = nfs4_run_open_task(data, 0); - if (status != 0 || !data->rpc_done) + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; return status; + } if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -2455,6 +2461,8 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); switch (status) { + case -NFS4ERR_BADNAME: + return -ENOENT; case -NFS4ERR_MOVED: err = nfs4_get_referral(dir, name, fattr, fhandle); break; -- cgit v1.2.3-58-ga151 From 523e1d399ce0e23bec562abe2b2f8d297af81161 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2011 14:31:07 +0200 Subject: block: make gendisk hold a reference to its queue The following command sequence triggers an oops. # mount /dev/sdb1 /mnt # echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete # umount /mnt general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs RIP: 0010:[] [] __lock_acquire+0x389/0x1d60 ... Call Trace: [] lock_acquire+0x95/0x140 [] _raw_spin_lock+0x3b/0x50 [] bdi_lock_two+0x5c/0x70 [] bdev_inode_switch_bdi+0x4c/0xf0 [] __blkdev_put+0x11b/0x1d0 [] __blkdev_put+0x160/0x1d0 [] blkdev_put+0x5f/0x190 [] kill_block_super+0x4d/0x80 [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xed/0x130 [] sys_umount+0x7e/0x3a0 [] system_call_fastpath+0x16/0x1b This is because bdev holds on to disk but disk doesn't pin the associated queue. If a SCSI device is removed while the device is still open, the sdev puts the base reference to the queue on release. When the bdev is finally released, the associated queue is already gone along with the bdi and bdev_inode_switch_bdi() ends up dereferencing already freed bdi. Even if it were not for this bug, disk not holding onto the associated queue is very unusual and error-prone. Fix it by making add_disk() take an extra reference to its queue and put it on disk_release() and ensuring that disk and its fops owner are put in that order after all accesses to the disk and queue are complete. Signed-off-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ fs/block_dev.c | 13 ++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/block/genhd.c b/block/genhd.c index e2f67902dd02..d261b73b9744 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. 
+ */ + WARN_ON_ONCE(blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 95f786ec7f08..1c44b8d54504 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); goto restart; } } @@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) @@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); out: bdput(bdev); @@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); -- cgit v1.2.3-58-ga151 From 92abc475d8de1c29373f6d96ed63d8ecaa199d25 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sun, 16 Oct 2011 18:15:16 -0700 Subject: jffs2: implement mount option parsing and compression overriding Currently jffs2 has compile-time constants (and .config options) controlling whether or not the various compression/decompression drivers are built in and enabled. This is fine for embedded systems, but it clashes with distribution kernels. Distro kernels tend to turn on everything; this causes OpenFirmware to fall over, as it only understands ZLIB-compressed inodes. Booting a kernel that has LZO compression enabled, writing to the boot partition, and then rebooting causes OFW to fail to read the kernel from the filesystem. This is because LZO compression has priority when writing new data to jffs2, if LZO is enabled. This patch adds mount option parsing, and a single supported option ("compr=none"). This adds the flexibility of being able to specify compressor overrides on a per-superblock basis. For now, we can simply disable compression; additional flexibility coming soon. v2: kill some printks, and implement show_options as suggested by Artem Bityutskiy.
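For example, with this applied the override can be requested at mount time (the device and mount point here are illustrative):

	# mount -t jffs2 -o compr=none /dev/mtdblock2 /mnt

and since show_options is implemented, the override remains visible afterwards through /proc/mounts.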
Signed-off-by: Andres Salomon Signed-off-by: Artem Bityutskiy --- fs/jffs2/compr.c | 9 +++-- fs/jffs2/fs.c | 2 +- fs/jffs2/jffs2_fs_sb.h | 6 ++++ fs/jffs2/os-linux.h | 2 +- fs/jffs2/super.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index de4247021d25..97bc74db2c96 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -76,13 +76,18 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t *datalen, uint32_t *cdatalen) { int ret = JFFS2_COMPR_NONE; - int compr_ret; + int mode, compr_ret; struct jffs2_compressor *this, *best=NULL; unsigned char *output_buf = NULL, *tmp_buf; uint32_t orig_slen, orig_dlen; uint32_t best_slen=0, best_dlen=0; - switch (jffs2_compression_mode) { + if (c->mount_opts.override_compr) + mode = c->mount_opts.compr; + else + mode = jffs2_compression_mode; + + switch (mode) { case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bbcb9755dd2b..5d54b4ed1b6c 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) jffs2_do_setattr(inode, &iattr); } -int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 0bc6a6c80a56..55a0c1dceadf 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -29,6 +29,11 @@ struct jffs2_inodirty; +struct jffs2_mount_opts { + bool override_compr; + unsigned int compr; +}; + /* A struct for the overall file system control. Pointers to jffs2_sb_info structs are named `c' in the source code. 
Nee jffs_control @@ -126,6 +131,7 @@ struct jffs2_sb_info { #endif struct jffs2_summary *summary; /* Summary information */ + struct jffs2_mount_opts mount_opts; #ifdef CONFIG_JFFS2_FS_XATTR #define XATTRINDEX_HASHSIZE (57) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 6c1755c59c0f..ab65ee3ec858 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags); struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_remount_fs(struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 853b8e300084..40f6e6385fd1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include "compr.h" #include "nodelist.h" @@ -75,6 +77,29 @@ static void jffs2_write_super(struct super_block *sb) unlock_super(sb); } +static const char *jffs2_compr_name(unsigned int compr) +{ + switch (compr) { + case JFFS2_COMPR_MODE_NONE: + return "none"; + default: + /* should never happen; programmer error */ + WARN_ON(1); + return ""; + } +} + +static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb); + struct jffs2_mount_opts *opts = &c->mount_opts; + + if (opts->override_compr) + seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); + + return 0; +} + static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -133,6 +158,71 @@ static const struct export_operations jffs2_export_ops = { .fh_to_parent = jffs2_fh_to_parent, }; +/* + * JFFS2 mount options. 
+ * + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + + if (!data) + return 0; + + while ((p = strsep(&data, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_override_compr: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) { + c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; + c->mount_opts.override_compr = true; + } + kfree(name); + break; + default: + printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", + p); + return -EINVAL; + } + } + + return 0; +} + +static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + int err; + + err = jffs2_parse_options(c, data); + if (err) + return -EINVAL; + + return jffs2_do_remount_fs(sb, flags, data); +} + static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, @@ -143,6 +233,7 @@ static const struct super_operations jffs2_super_operations = .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, .dirty_inode = jffs2_dirty_inode, + .show_options = jffs2_show_options, .sync_fs = jffs2_sync_fs, }; @@ -166,6 +257,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) c->os_priv = sb; sb->s_fs_info = c; + ret = jffs2_parse_options(c, data); + if (ret) { + kfree(c); + return -EINVAL; + } + /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ mutex_init(&c->alloc_sem); -- cgit v1.2.3-58-ga151 From 123005f3ccfa58637ad6e1a8b9f7f3f861ca65f4 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sun, 16 Oct 2011 18:15:23 -0700 Subject: jffs2: add compr=lzo and compr=zlib options ..to allow forcing of either compression scheme. This will override compiled-in defaults. jffs2_compress is reworked a bit, as the lzo/zlib override shares lots of code w/ the PRIORITY mode. v2: update show_options accordingly. Signed-off-by: Andres Salomon Signed-off-by: Artem Bityutskiy --- fs/jffs2/compr.c | 119 +++++++++++++++++++++++++++++++++++++++---------------- fs/jffs2/compr.h | 2 + fs/jffs2/super.c | 26 +++++++++++- 3 files changed, 110 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 97bc74db2c96..5b6c9d1a2fb9 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, return 0; } +/* + * jffs2_selected_compress: + * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB). + * If 0, just take the first available compression mode. + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: the compression type used. 
Zero is used to show that the data + * could not be compressed; probably because we couldn't find the requested + * compression mode. + */ +static int jffs2_selected_compress(u8 compr, unsigned char *data_in, + unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) +{ + struct jffs2_compressor *this; + int err, ret = JFFS2_COMPR_NONE; + uint32_t orig_slen, orig_dlen; + char *output_buf; + + output_buf = kmalloc(*cdatalen, GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + return ret; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only and disabled modules */ + if (!this->compress || this->disabled) + continue; + + /* Skip if not the desired compression type */ + if (compr && (compr != this->compr)) + continue; + + /* + * Either compression type was unspecified, or we found our + * compressor; either way, we're good to go. + */ + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + + *datalen = orig_slen; + *cdatalen = orig_dlen; + err = this->compress(data_in, output_buf, datalen, cdatalen); + + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!err) { + /* Success */ + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + else + *cpage_out = output_buf; + + return ret; +} + /* jffs2_compress: * @data_in: Pointer to uncompressed data * @cpage_out: Pointer to returned pointer to buffer for compressed data @@ -91,37 +163,8 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: - output_buf = kmalloc(*cdatalen,GFP_KERNEL); - if (!output_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. 
Compression failed.\n"); - goto out; - } - orig_slen = *datalen; - orig_dlen = *cdatalen; - spin_lock(&jffs2_compressor_list_lock); - list_for_each_entry(this, &jffs2_compressor_list, list) { - /* Skip decompress-only backwards-compatibility and disabled modules */ - if ((!this->compress)||(this->disabled)) - continue; - - this->usecount++; - spin_unlock(&jffs2_compressor_list_lock); - *datalen = orig_slen; - *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); - spin_lock(&jffs2_compressor_list_lock); - this->usecount--; - if (!compr_ret) { - ret = this->compr; - this->stat_compr_blocks++; - this->stat_compr_orig_size += *datalen; - this->stat_compr_new_size += *cdatalen; - break; - } - } - spin_unlock(&jffs2_compressor_list_lock); - if (ret == JFFS2_COMPR_NONE) - kfree(output_buf); + ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, + cdatalen); break; case JFFS2_COMPR_MODE_SIZE: case JFFS2_COMPR_MODE_FAVOURLZO: @@ -179,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, best->stat_compr_orig_size += best_slen; best->stat_compr_new_size += best_dlen; ret = best->compr; + *cpage_out = output_buf; } spin_unlock(&jffs2_compressor_list_lock); break; + case JFFS2_COMPR_MODE_FORCELZO: + ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, + cpage_out, datalen, cdatalen); + break; + case JFFS2_COMPR_MODE_FORCEZLIB: + ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, + cpage_out, datalen, cdatalen); + break; default: printk(KERN_ERR "JFFS2: unknown compression mode.\n"); } - out: + if (ret == JFFS2_COMPR_NONE) { *cpage_out = data_in; *datalen = *cdatalen; none_stat_compr_blocks++; none_stat_compr_size += *datalen; } - else { - *cpage_out = output_buf; - } return ret; } diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 13bb7597ab39..5e91d578f4ed 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -40,6 +40,8 @@ #define JFFS2_COMPR_MODE_PRIORITY 1 #define JFFS2_COMPR_MODE_SIZE 2 #define JFFS2_COMPR_MODE_FAVOURLZO 3 +#define JFFS2_COMPR_MODE_FORCELZO 4 +#define JFFS2_COMPR_MODE_FORCEZLIB 5 #define FAVOUR_LZO_PERCENT 80 diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 40f6e6385fd1..e7e974454115 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -82,6 +82,14 @@ static const char *jffs2_compr_name(unsigned int compr) switch (compr) { case JFFS2_COMPR_MODE_NONE: return "none"; +#ifdef CONFIG_JFFS2_LZO + case JFFS2_COMPR_MODE_FORCELZO: + return "lzo"; +#endif +#ifdef CONFIG_JFFS2_ZLIB + case JFFS2_COMPR_MODE_FORCEZLIB: + return "zlib"; +#endif default: /* should never happen; programmer error */ WARN_ON(1); @@ -195,11 +203,25 @@ static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) if (!name) return -ENOMEM; - if (!strcmp(name, "none")) { + if (!strcmp(name, "none")) c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; - c->mount_opts.override_compr = true; +#ifdef CONFIG_JFFS2_LZO + else if (!strcmp(name, "lzo")) + c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; +#endif +#ifdef CONFIG_JFFS2_ZLIB + else if (!strcmp(name, "zlib")) + c->mount_opts.compr = + JFFS2_COMPR_MODE_FORCEZLIB; +#endif + else { + printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"", + name); + kfree(name); + return -EINVAL; } kfree(name); + c->mount_opts.override_compr = true; break; default: printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", -- cgit v1.2.3-58-ga151 From a8d86cd75b709a9c9402c46674ea188493c53901 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Wed, 19 Oct 2011 11:42:03 -0400 Subject: nfsd4: remove unneeded CLAIM_DELEGATE_CUR workaround 0c12eaffdf09466f36a9ffe970dda8f4aeb6efc0 "nfsd: don't break lease on CLAIM_DELEGATE_CUR" was a temporary workaround for a problem fixed properly in the vfs layer by 778fc546f749c588aa2f6cd50215d2715c374252 "locks: fix tracking of inprogress lease breaks", so we can revert that change (but keeping some minor cleanup from that commit). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e1d4e08dfad..15e0db140403 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2653,12 +2653,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); - /* CLAIM_DELEGATE_CUR is used in response to a broken lease; - * allowing it to break the lease and return EAGAIN leaves the - * client unable to make progress in returning the delegation */ - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - access |= NFSD_MAY_NOT_BREAK_LEASE; - if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); -- cgit v1.2.3-58-ga151 From 8b289b2c2355c3bea75f3e499b4aa251a3191382 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 19 Oct 2011 11:52:12 -0400 Subject: nfsd4: implement new 4.1 open reclaim types Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +++------------ fs/nfsd/nfs4state.c | 10 ++++++++-- fs/nfsd/nfs4xdr.c | 13 +++++++++++++ include/linux/nfs4.h | 5 ++++- 4 files changed, 28 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 710b97b7a2f3..458ebb6b59c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -366,12 +366,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - /* - * (1) set CURRENT_FH to the file being opened, - * creating it if necessary, (2) set open->op_cinfo, - * (3) set open->op_truncate if the file is to be - * truncated after opening, (4) do permission checking. - */ status = do_open_lookup(rqstp, &cstate->current_fh, open); if (status) @@ -379,17 +373,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - /* - * The CURRENT_FH is already set to the file being - * opened. (1) set open->op_cinfo, (2) set - * open->op_truncate if the file is to be truncated - * after opening, (3) do permission checking. 
- */ + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, open); if (status) goto out; break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 15e0db140403..e8c2a3ec0e60 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2587,6 +2587,12 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei return delegstateid(ret); } +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; +} + static __be32 nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) @@ -2602,7 +2608,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open if (status) *dp = NULL; out: - if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; @@ -2879,7 +2885,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } else { status = nfserr_bad_stateid; - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (nfsd4_is_deleg_cur(open)) goto out; status = nfserr_jukebox; fp = open->op_file; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 645a0a9d8073..fdc09a52cd8d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -803,6 +803,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) return status; break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; default: goto xdr_error; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b875b0324fc0..32345c2805c0 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -410,7 +410,10 @@ enum open_claim_type4 { NFS4_OPEN_CLAIM_NULL = 0, NFS4_OPEN_CLAIM_PREVIOUS = 1, NFS4_OPEN_CLAIM_DELEGATE_CUR = 2, - NFS4_OPEN_CLAIM_DELEGATE_PREV = 3 + NFS4_OPEN_CLAIM_DELEGATE_PREV = 3, + NFS4_OPEN_CLAIM_FH = 4, /* 4.1 */ + NFS4_OPEN_CLAIM_DELEG_CUR_FH = 5, /* 4.1 */ + NFS4_OPEN_CLAIM_DELEG_PREV_FH = 6, /* 4.1 */ }; enum opentype4 { -- cgit v1.2.3-58-ga151 From f1bdcc0a8278aa42cb77331275890aac85a4e7cd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jul 2011 14:28:08 -0400 Subject: Btrfs: move stuff around in btrfs_inode to get better packing Moving things around to give us better packing in the btrfs_inode. This reduces the size of our inode by 8 bytes. 
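The saving is plain C structure padding at work; a minimal illustration on a 64-bit kernel using the kernel u64/u32 types (not the actual btrfs_inode layout, just the rule that lone 32-bit members between 64-bit ones create holes):

	struct bad  { u64 a; u32 f; u64 b; u32 g; };	/* sizeof == 32: 4-byte hole after f */
	struct good { u64 a; u64 b; u32 f; u32 g; };	/* sizeof == 24: the u32s pack together */

pahole(1) makes such holes easy to spot.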
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9f99a16edd6..bf325f40cf92 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -115,9 +115,6 @@ struct btrfs_inode { */ u64 disk_i_size; - /* flags field from the on disk inode */ - u32 flags; - /* * if this is a directory then index_cnt is the counter for the index * number for new files that are created @@ -131,6 +128,9 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* flags field from the on disk inode */ + u32 flags; + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent -- cgit v1.2.3-58-ga151 From 0cbbdf7c9c46467bfb7129c30236f36a679ab244 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jul 2011 16:02:04 -0400 Subject: Btrfs: kill reserved_bytes in inode reserved_bytes is not used for anything in the inode, remove it. Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 5 ----- fs/btrfs/extent-tree.c | 2 -- fs/btrfs/inode.c | 1 - 3 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bf325f40cf92..c70fb10a307b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -103,11 +103,6 @@ struct btrfs_inode { */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..03edac4f7771 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3122,7 +3122,6 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; @@ -3144,7 +3143,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..156c3a0da792 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6755,7 +6755,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; - ei->reserved_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->index_cnt = (u64)-1; -- cgit v1.2.3-58-ga151 From ba5b8958dabbd7890a6929af1ffc0d87187765dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Jul 2011 15:40:35 -0400 Subject: Btrfs: use d_obtain_alias when mounting subvol/subvolid Currently what we do is just wrong. We either 1) Alloc a new "root" dentry with sb->s_root as its parent, which is just wrong as we could walk into this subvol later on via another path and hilarity could ensue. Also we don't check the return value of d_splice_alias which isn't good either. or 2) Do a d_find_alias(), but we could have lost our dentry from cache at this point and found nothing. So use d_obtain_alias(). In the case that we already have the inode/dentry in cache we will get the correct dentry. If not we will get a disconnected dentry tree so if we walk into it later on everything will be connected up properly.
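The reason a single call can replace both of the old paths is that d_obtain_alias() folds the cached and uncached cases together; roughly, as a sketch of the calling pattern rather than the patch itself:

	/*
	 * If the inode already has a dentry in cache, the existing alias
	 * comes back; otherwise we get a DCACHE_DISCONNECTED dentry that
	 * is reconnected on a later path walk. The inode reference is
	 * consumed in every case, errors included, so no iput() is needed
	 * on any path after this call.
	 */
	return d_obtain_alias(inode);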
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4648d7..244fa46c50b8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -492,7 +492,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -566,29 +565,7 @@ setup_root: return dget(sb->s_root); } - if (new) { - const struct qstr name = { .name = "/", .len = 1 }; - - /* - * New inode, we need to make the dentry a sibling of s_root so - * everything gets cleaned up properly on unmount. - */ - dentry = d_alloc(sb->s_root, &name); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - d_splice_alias(inode, dentry); - } else { - /* - * We found the inode in cache, just find a dentry for it and - * put the reference to the inode we just got. - */ - dentry = d_find_alias(inode); - iput(inode); - } - - return dentry; + return d_obtain_alias(inode); } static int btrfs_fill_super(struct super_block *sb, -- cgit v1.2.3-58-ga151 From 830c4adbd04a79f806d4fa579546f36a71b727c1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Jul 2011 15:55:42 -0400 Subject: Btrfs: fix how we mount subvol= We've only been able to mount with subvol= where whatever was a subvol within whatever root we had as the default. This allows us to mount -o subvol=path/to/subvol/you/want relative from the normal fs_tree root. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 199 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 244fa46c50b8..934789f7fd33 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "compat.h" #include "delayed-inode.h" #include "ctree.h" @@ -58,6 +59,7 @@ #include static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, char nbuf[16]) @@ -411,7 +413,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, int intarg; if (!options) - goto out; + return 0; /* * strsep changes the string, duplicate it because parse_options @@ -460,26 +462,15 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, error = btrfs_scan_one_device(match_strdup(&args[0]), flags, holder, fs_devices); if (error) - goto out_free_opts; + goto out; break; default: break; } } - out_free_opts: +out: kfree(orig); - out: - /* - * If no subvolume name is specified we use the default one. Allocate - * a copy of the string "." here so that code later in the - * mount path doesn't care if it's the default volume or another one. - */ - if (!*subvol_name) { - *subvol_name = kstrdup(".", GFP_KERNEL); - if (!*subvol_name) - return -ENOMEM; - } return error; } @@ -730,6 +721,118 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. 
+ */ +static char *setup_root_args(char *args) +{ + unsigned copied = 0; + unsigned len = strlen(args) + 2; + char *pos; + char *ret; + + /* + * We need the same args as before, but minus + * + * subvol=a + * + * and add + * + * subvolid=0 + * + * which is a difference of 2 characters, so we allocate strlen(args) + + * 2 characters. + */ + ret = kzalloc(len * sizeof(char), GFP_NOFS); + if (!ret) + return NULL; + pos = strstr(args, "subvol="); + + /* This shouldn't happen, but just in case.. */ + if (!pos) { + kfree(ret); + return NULL; + } + + /* + * The subvol=<> arg is not at the front of the string, copy everybody + * up to that into ret. + */ + if (pos != args) { + *pos = '\0'; + strcpy(ret, args); + copied += strlen(args); + pos++; + } + + strncpy(ret + copied, "subvolid=0", len - copied); + + /* Length of subvolid=0 */ + copied += 10; + + /* + * If there is no , after the subvol= option then we know there's no + * other options and we can just return. + */ + pos = strchr(pos, ','); + if (!pos) + return ret; + + /* Copy the rest of the arguments into our buffer */ + strncpy(ret + copied, pos, len - copied); + copied += strlen(pos); + + return ret; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct super_block *s; + struct dentry *root; + struct vfsmount *mnt; + struct mnt_namespace *ns_private; + char *newargs; + struct path path; + int error; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + kfree(newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + ns_private = create_mnt_ns(mnt); + if (IS_ERR(ns_private)) { + mntput(mnt); + return ERR_CAST(ns_private); + } + + /* + * This will trigger the automount of the subvol so we can just + * drop the mnt we have here and return the dentry that we + * found. + */ + error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, + LOOKUP_FOLLOW, &path); + put_mnt_ns(ns_private); + if (error) + return ERR_PTR(error); + + /* Get a ref to the sb and the dentry we found and return it */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + root = dget(path.dentry); + path_put(&path); + down_write(&s->s_umount); + + return root; +} /* * Find a superblock for the given device / mount point. 
@@ -761,13 +864,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) return ERR_PTR(error); + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - goto error_free_subvol_name; + return ERR_PTR(error); error = btrfs_open_devices(fs_devices, mode, fs_type); if (error) - goto error_free_subvol_name; + return ERR_PTR(error); if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; @@ -792,14 +901,15 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, bdev = fs_devices->latest_bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); - if (IS_ERR(s)) - goto error_s; + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; + return ERR_PTR(-EBUSY); } btrfs_close_devices(fs_devices); @@ -814,64 +924,25 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error_free_subvol_name; + return ERR_PTR(error); } btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } - /* if they gave us a subvolume name bind mount into that */ - if (strcmp(subvol_name, ".")) { - struct dentry *new_root; - - root = get_default_root(s, subvol_rootid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } - - mutex_lock(&root->d_inode->i_mutex); - new_root = lookup_one_len(subvol_name, root, - strlen(subvol_name)); - mutex_unlock(&root->d_inode->i_mutex); - - if (IS_ERR(new_root)) { - dput(root); - deactivate_locked_super(s); - error = PTR_ERR(new_root); - goto error_free_subvol_name; - } - if (!new_root->d_inode) { - dput(root); - dput(new_root); - deactivate_locked_super(s); - error = -ENXIO; - goto error_free_subvol_name; - } - dput(root); - root = new_root; - } else { - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + deactivate_locked_super(s); + return root; } - kfree(subvol_name); return root; -error_s: - error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); kfree(fs_info); kfree(tree_root); -error_free_subvol_name: - kfree(subvol_name); return ERR_PTR(error); } -- cgit v1.2.3-58-ga151 From fb25e9141ab843794d5cdef3936ccb58435e2371 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Jul 2011 17:00:46 -0400 Subject: Btrfs: use bytes_may_use for all ENOSPC reservations We have been using bytes_reserved for metadata reservations, which is wrong since we use that to keep track of outstanding reservations from the allocator. This resulted in us doing a lot of silly things to make sure we don't allocate a bunch of metadata chunks since we never had a real view of how much space was actually in use by metadata. This passes Arne's enospc test and xfstests as well as my own enospc tests. Hopefully this will get us moving in the right direction. 
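The intended division of labour between the counters after this change is, as a simplified sketch (the real code does all of this under space_info->lock, via the helpers in the diff below):

	space_info->bytes_may_use  += num_bytes;  /* ENOSPC reservation made   */
	space_info->bytes_may_use  -= num_bytes;  /* allocator hands out space */
	space_info->bytes_reserved += num_bytes;
	space_info->bytes_reserved -= num_bytes;  /* extent becomes used       */
	space_info->bytes_used     += num_bytes;

so bytes_reserved only ever tracks outstanding allocator reservations, while bytes_may_use carries the ENOSPC accounting.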
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 - fs/btrfs/extent-tree.c | 163 ++++++++++++++++++++++++-------------------- fs/btrfs/free-space-cache.c | 29 ++++++-- 3 files changed, 112 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..332cbdc86ad4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2196,8 +2196,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 03edac4f7771..fbe6278f466b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -52,6 +52,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +96,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -3128,9 +3145,7 @@ commit_trans: } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3163,6 +3178,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3170,6 +3186,13 @@ static int should_alloc_chunk(struct btrfs_root *root, if (force == CHUNK_ALLOC_FORCE) return 1; + /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. 
@@ -3317,7 +3340,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) @@ -3341,9 +3364,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3401,7 +3424,6 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, int ret = 0; bool committed = false; bool flushing = false; - again: ret = 0; spin_lock(&space_info->lock); @@ -3443,7 +3465,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - space_info->bytes_reserved += orig_bytes; + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3614,7 +3636,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3825,12 +3847,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4133,7 +4155,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4185,7 +4206,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4212,46 +4232,55 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } -/* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. 
+ * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4322,7 +4351,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } else if (cache->reserved_pinned > 0) { len = min(len, cache->reserved_pinned); cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; + cache->space_info->bytes_may_use += len; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4701,27 +4730,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } goto out; } pin: @@ -4881,6 +4891,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -5200,8 +5212,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -5323,7 +5335,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5425,7 +5438,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5628,7 +5641,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -6594,7 +6608,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->reserved_pinned + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; + sinfo->bytes_may_use += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; @@ -6962,7 +6976,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927401d0..79c16a68a2bc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2472,9 +2472,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, spin_unlock(&ctl->tree_lock); if (bytes >= minlen) { - int update_ret; - update_ret = btrfs_update_reserved_bytes(block_group, - bytes, 1, 1); + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); ret = btrfs_error_discard_extent(fs_info->extent_root, start, @@ -2482,9 +2492,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, &actually_trimmed); btrfs_add_free_space(block_group, start, bytes); - if (!update_ret) - btrfs_update_reserved_bytes(block_group, - bytes, 0, 1); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + 
space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } if (ret) break; -- cgit v1.2.3-58-ga151 From 9e4871070b5f070cacf26525389d56e0345ba156 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 1 Aug 2011 12:08:18 -0400 Subject: Btrfs: skip looking for delalloc if we don't have ->fill_delalloc We always look for delalloc bytes in our io_tree so we can fill in delalloc. This is fine in most cases, but if we're writing out the btree_inode this is just a superfluous tree search on the io_tree, and if we have a lot of metadata dirty this could be an expensive check. So instead check to see if our io_tree has a ->fill_delalloc op, and if not don't even bother doing the lookup. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..7d5e55632809 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2136,6 +2136,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, int compressed; int write_flags; unsigned long nr_written = 0; + bool fill_delalloc = true; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; @@ -2166,10 +2167,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); + if (!tree->ops || !tree->ops->fill_delalloc) + fill_delalloc = false; + delalloc_start = start; delalloc_end = 0; page_started = 0; - if (!epd->extent_locked) { + if (!epd->extent_locked && fill_delalloc) { u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated -- cgit v1.2.3-58-ga151 From 7709cde33f12db71efb377fae4ae7aab6c94ebc6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 10:25:02 -0400 Subject: Btrfs: calculate checksum space correctly We have not been reserving enough space for checksums. We were just reserving bytes for the checksum items themselves, we were not taking into account having to cow the tree and such. This patch adds a csum_bytes counter to the inode for keeping track of the number of bytes outstanding we have for checksums. Then we calculate how many leaves would be required for the checksums we are given and use that to reserve space. This adds a significant amount of bytes to our reservations, but we will handle this later. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 6 +++ fs/btrfs/extent-tree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/inode.c | 3 ++ 3 files changed, 118 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c70fb10a307b..5a5d325a3935 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -123,6 +123,12 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. 
+ */ + u64 csum_bytes; + /* flags field from the on disk inode */ u32 flags; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fbe6278f466b..4add1ac2dda0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3984,11 +3984,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ static unsigned drop_outstanding_extent(struct inode *inode) { unsigned dropped_extents = 0; - spin_lock(&BTRFS_I(inode)->lock); BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; @@ -3998,19 +4006,70 @@ static unsigned drop_outstanding_extent(struct inode *inode) */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - goto out; + return 0; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: - spin_unlock(&BTRFS_I(inode)->lock); return dropped_extents; } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. 
+ */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - return num_bytes >>= 3; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -4037,9 +4096,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); } + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { unsigned dropped; @@ -4047,8 +4106,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * We don't need the return value since our reservation failed, * we just need to clean up our counter. */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); WARN_ON(dropped > 1); + BTRFS_I(inode)->csum_bytes -= num_bytes; + spin_unlock(&BTRFS_I(inode)->lock); return ret; } @@ -4057,6 +4119,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. 
+ */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4064,9 +4135,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4074,6 +4147,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4091,6 +4179,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 156c3a0da792..98b9fa2d77f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6757,6 +6757,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delalloc_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; @@ -6802,6 +6803,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also -- cgit v1.2.3-58-ga151 From dba68306f3fae681b1005137f130f5bcfdfed34a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 15:34:57 -0400 Subject: Btrfs: kill the orphan space calculation for snapshots This patch kills off the calculation for the amount of space needed for the orphan operations during a snapshot. The thing is we only do snapshots on commit, so any space that is in the block_rsv->freed[] isn't going to be in the new snapshot anyway, so there isn't any reason to require that space to be reserved for the snapshot to occur. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 5 --- fs/btrfs/inode.c | 83 -------------------------------------------------- fs/btrfs/transaction.c | 2 -- 3 files changed, 90 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 332cbdc86ad4..4ef777e85c89 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2577,11 +2577,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98b9fa2d77f7..5f5f8a577e69 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2079,89 +2079,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) up_read(&root->fs_info->cleanup_work_sem); } -/* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. - */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve) -{ - struct btrfs_root *root; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - - root = pending->root; - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - block_rsv = root->orphan_block_rsv; - - /* orphan block reservation for the snapshot */ - num_bytes = block_rsv->size; - - /* - * after the snapshot is created, COWing tree blocks may use more - * space than it frees. So we should make sure there is enough - * reserved space. 
- */ - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes += block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - } - - *bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_root *snap = pending->snap; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - int ret; - - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - /* refill source subvolume's orphan block reservation */ - block_rsv = root->orphan_block_rsv; - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes = block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - root->orphan_block_rsv, - num_bytes); - BUG_ON(ret); - } - - /* setup orphan block reservation for the snapshot */ - block_rsv = btrfs_alloc_block_rsv(snap); - BUG_ON(!block_rsv); - - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - snap->orphan_block_rsv = block_rsv; - - num_bytes = root->orphan_block_rsv->size; - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - block_rsv, num_bytes); - BUG_ON(ret); - -#if 0 - /* insert orphan item for the snapshot */ - WARN_ON(!root->orphan_item_inserted); - ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, - snap->root_key.objectid); - BUG_ON(ret); - snap->orphan_item_inserted = 1; -#endif -} - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e24b7964a155..3e20cc8c1c06 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -911,7 +911,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); - btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, @@ -1002,7 +1001,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BUG_ON(IS_ERR(pending->snap)); btrfs_reloc_post_snapshot(trans, pending); - btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); trans->block_rsv = rsv; -- cgit v1.2.3-58-ga151 From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 10:25:38 -0400 Subject: Btrfs: kill the durable block rsv stuff This is confusing code and isn't used by anything anymore, so delete it. 
Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 ------ fs/btrfs/disk-io.c | 2 - fs/btrfs/extent-tree.c | 100 +++++++++---------------------------------------- fs/btrfs/inode.c | 4 -- fs/btrfs/relocation.c | 1 - 5 files changed, 17 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ef777e85c89..c5ceba4078cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -772,13 +772,10 @@ struct btrfs_space_info { struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; atomic_t usage; unsigned int priority:8; - unsigned int durable:1; unsigned int refill_used:1; unsigned int full:1; }; @@ -840,7 +837,6 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -919,11 +915,6 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -2238,8 +2229,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..0b5643a68d57 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1665,8 +1665,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4add1ac2dda0..30c0558eae84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) spin_lock_init(&rsv->lock); atomic_set(&rsv->usage, 1); rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root, { if (rsv && atomic_dec_and_test(&rsv->usage)) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); + kfree(rsv); } } -/* - * make the block_rsv struct be able to capture freed space. 
- * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); -} - int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, ret = 0; } else { num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; + commit_trans = 1; } spin_unlock(&block_rsv->lock); if (!ret) @@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, } if (commit_trans) { + struct btrfs_space_info *sinfo = block_rsv->space_info; + if (trans) return -EAGAIN; + + spin_lock(&sinfo->lock); + if (sinfo->bytes_pinned < num_bytes) { + spin_unlock(&sinfo->lock); + return -ENOSPC; + } + spin_unlock(&sinfo->lock); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_may_use += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 
pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } } out: /* @@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_may_use += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f5f8a577e69..6402a41b9023 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2164,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -6505,7 +6502,6 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273d..545b04358249 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3651,7 +3651,6 @@ int prepare_to_relocate(struct reloc_control *rc) return ret; rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; -- cgit v1.2.3-58-ga151 From 4289a667a0d7c6b134898cac7bfbe950267c305c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 13:22:24 -0400 Subject: Btrfs: fix how we reserve space for deleting inodes I converted btrfs_truncate to do sane reservations for truncate, but didn't convert btrfs_evict_inode. Basically we need to save the orphan_rsv for deleting the orphan item, and do normal reservations for our truncate. 
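The new eviction path below boils down to a reserve-then-retry loop. Roughly, as a condensed sketch of the hunk that follows (error paths and the end-of-transaction bookkeeping elided, see the diff for the real flow):

    rsv = btrfs_alloc_block_rsv(root);      /* slack space for the truncate */
    while (1) {
            /* if we cannot get even the slack space, bail out and
             * leave the orphan item to be cleaned up on next mount */
            if (btrfs_block_rsv_check(NULL, root, rsv, min_size, 0))
                    break;
            trans = btrfs_start_transaction(root, 0);
            trans->block_rsv = rsv;
            ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
            if (ret != -EAGAIN)     /* -EAGAIN: end this trans, loop again */
                    break;
            btrfs_end_transaction(trans, root);
    }

The orphan item deletion at the end still runs out of root->orphan_block_rsv, which the unlink already funded, so the loop only has to pay for the truncate itself.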
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6402a41b9023..fe3891e240b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3527,6 +3527,8 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; + u64 min_size = btrfs_calc_trans_metadata_size(root, 2); unsigned long nr; int ret; @@ -3554,22 +3556,44 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_i_size_write(inode, 0); + /* + * This is a bit simpler than btrfs_truncate since + * + * 1) We've already reserved our space for our orphan item in the + * unlink. + * 2) We're going to delete the inode item, so we don't need to update + * it at all. + * + * So we just need to reserve some slack space in case we add bytes when + * doing the truncate. + */ while (1) { - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = root->orphan_block_rsv; - - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); + ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + printk(KERN_WARNING "Could not get space for a " + "delete, will truncate on mount\n"); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3578,14 +3602,17 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); - } + btrfs_free_block_rsv(root, rsv); + if (ret == 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); BUG_ON(ret); } + trans->block_rsv = &root->fs_info->trans_block_rsv; if (!(root == root->fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); -- cgit v1.2.3-58-ga151 From 6ab60601d518d563ca1a47eaa399096e69d3b64a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 08:24:46 -0400 Subject: Btrfs: ratelimit the generation printk for the free space cache A user reported getting spammed when moving to 3.0 by this message. Since we switched to the normal checksumming infrastructure all old free space caches will be wrong and need to be regenerated so people are likely to see this message a lot, so ratelimit it so it doesn't fill up their logs and freak them out. 
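For reference, printk_ratelimited() (from linux/ratelimit.h) is a drop-in replacement for printk() that keeps a per-call-site ratelimit state; with the default settings it lets through a burst of at most 10 messages per 5-second window and drops the rest. The usage pattern is simply (shortened message for illustration, the real string is in the hunk below):

    #include <linux/ratelimit.h>

    /* at most ~10 of these per 5 seconds from this call site */
    printk_ratelimited(KERN_ERR "btrfs: space cache generation mismatch\n");

The format string and arguments stay exactly as they were; only the function name changes.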
Thanks, Reported-by: Andrew Lutomirski Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 79c16a68a2bc..ecc1a4f85d20 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -341,11 +342,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, gen = addr; if (*gen != BTRFS_I(inode)->generation) { - printk(KERN_ERR "btrfs: space cache generation" - " (%llu) does not match inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); + printk_ratelimited(KERN_ERR "btrfs: space cache" + " generation (%llu) does not match " + "inode (%llu)\n", + (unsigned long long)*gen, + (unsigned long long) + BTRFS_I(inode)->generation); kunmap(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-58-ga151 From dabdb6408cb801644fa613c7432da012640b348c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 12:50:18 -0400 Subject: Btrfs: kill unused parts of block_rsv The priority and refill_used flags are not used anymore, and neither is the usage counter, so just remove them from btrfs_block_rsv. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 23 ++++++----------------- fs/btrfs/relocation.c | 2 -- 3 files changed, 6 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5ceba4078cc..58a06dea4791 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -774,9 +774,6 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - atomic_t usage; - unsigned int priority:8; - unsigned int refill_used:1; unsigned int full:1; }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30c0558eae84..5395cc639270 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3659,8 +3659,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3681,10 +3679,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - kfree(rsv); - } + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, @@ -3734,13 +3730,10 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } if (commit_trans) { @@ -3859,16 +3852,12 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 
fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 545b04358249..aeaed99e9cfe 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3650,8 +3650,6 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->block_rsv->refill_used = 1; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; -- cgit v1.2.3-58-ga151 From 13553e5221d6901a33b3f2157a389de085c161fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 13:33:21 -0400 Subject: Btrfs: don't try to commit in btrfs_block_rsv_check We will try and reserve metadata bytes in btrfs_block_rsv_check and if we cannot because we have a transaction open it will return EAGAIN, so we do not need to try and commit the transaction again. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5395cc639270..6356ef2f0c80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3708,7 +3708,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, u64 min_reserved, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) @@ -3720,13 +3719,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (min_reserved > num_bytes) num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) { + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; @@ -3736,26 +3734,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - if (commit_trans) { - struct btrfs_space_info *sinfo = block_rsv->space_info; - - if (trans) - return -EAGAIN; - - spin_lock(&sinfo->lock); - if (sinfo->bytes_pinned < num_bytes) { - spin_unlock(&sinfo->lock); - return -ENOSPC; - } - spin_unlock(&sinfo->lock); - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - return 0; - } - - return -ENOSPC; + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, -- cgit v1.2.3-58-ga151 From 907cbcebd4e5f641faf08601f216b1ceb6cb3bdf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 13:46:15 -0400 Subject: Btrfs: optimize how we account for space in truncate Currently we're starting and stopping a transaction for no real reason, so kill that and just reserve enough space as if we can truncate all in one transaction. Also use btrfs_block_rsv_check() for our reserve to minimize the amount of space we may have to allocate for our slack space. 
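In outline, the reworked btrfs_truncate() reserves once up front and then merely tops up. A simplified sketch of the hunks below (error handling and transaction restarts elided):

    /* one trans start funds slack + orphan add + orphan del + inode update */
    trans = btrfs_start_transaction(root, 5);
    /* move the truncate slack into our private reserve, once */
    btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, min_size);
    while (1) {
            /* no-op while rsv still holds min_size; refills it otherwise */
            ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0);
            if (ret)
                    break;
            trans->block_rsv = rsv;
            ret = btrfs_truncate_inode_items(trans, root, inode,
                                             inode->i_size,
                                             BTRFS_EXTENT_DATA_KEY);
            if (ret != -EAGAIN)
                    break;
    }

The old loop instead ended the transaction and re-ran the full reservation dance on every pass, even when the reserve was still untouched.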
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 58 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fe3891e240b3..4d057c084de6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6482,6 +6482,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trans_metadata_size(root, 2); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6530,17 +6531,21 @@ static int btrfs_truncate(struct inode *inode) if (!rsv) return -ENOMEM; - trans = btrfs_start_transaction(root, 4); + /* + * 2 for the truncate slack space + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion + * 1 for updating the inode. + */ + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - /* - * Reserve space for the truncate process. Truncate should be adding - * space, but if there are snapshots it may end up using space. - */ - ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); BUG_ON(ret); ret = btrfs_orphan_add(trans, inode); @@ -6549,21 +6554,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - /* - * Ok so we've already migrated our bytes over for the truncate, so here - * just reserve the one slot we need for updating the inode. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; - } - trans->block_rsv = rsv; - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -6585,20 +6575,30 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0); + if (ret) { + /* + * This can only happen with the original transaction we + * started above, every other time we shouldn't have a + * transaction started yet. + */ + if (ret == -EAGAIN) + goto end_trans; + err = ret; + break; + } + if (!trans) { - trans = btrfs_start_transaction(root, 3); + /* Just need the 1 for updating the inode */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - - ret = btrfs_truncate_reserve_metadata(trans, root, - rsv); - BUG_ON(ret); - - trans->block_rsv = rsv; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } - +end_trans: nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; -- cgit v1.2.3-58-ga151 From 5e962c7850c273b483acc747b41bd5cddf631049 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 14:03:37 -0400 Subject: Btrfs: kill btrfs_truncate_reserve_metadata Since we've optimized the truncate path, we no longer require this function. 
Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 31 ------------------------------- 2 files changed, 34 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 58a06dea4791..22a9347a3908 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2240,9 +2240,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6356ef2f0c80..d05967e9d613 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3858,37 +3858,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); } -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; - u64 num_bytes; - int ret; - - /* - * Truncate should be freeing data, but give us 2 items just in case it - * needs to use some space. We may want to be smarter about this in the - * future. - */ - num_bytes = btrfs_calc_trans_metadata_size(root, 2); - - /* We already have enough bytes, just return */ - if (rsv->reserved >= num_bytes) - return 0; - - num_bytes -= rsv->reserved; - - /* - * You should have reserved enough space before hand to do this, so this - * should not fail. - */ - ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); - BUG_ON(ret); - - return 0; -} - void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { -- cgit v1.2.3-58-ga151 From 1b9c332b6c92e992b1971a08412c6f460a54b514 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 17 Aug 2011 10:19:52 -0400 Subject: Btrfs: only reserve space in fallocate if we have to do a preallocate Lukas found a problem where if he tries to fallocate over the same region twice and the first fallocate took up all the space we would fail with ENOSPC. This is because we reserve the total space we want to use for fallocate, regardless of whether or not we will have to actually preallocate. So instead move the check into the loop where we actually have to do the preallocate. Thanks, Tested-by: Lukas Czerner Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4e57d59edb7..de569af766fe 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1615,10 +1615,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -1664,11 +1660,27 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + + /* + * Make sure we have enough space before we do the + * allocation.
+ */ + ret = btrfs_check_data_free_space(inode, last_byte - + cur_offset); + if (ret) { + free_extent_map(em); + break; + } + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); + + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, last_byte - + cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1694,8 +1706,6 @@ static long btrfs_fallocate(struct file *file, int mode, } unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3-58-ga151 From 07127184efb629f1336c0592bfdacec258cab731 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 10:29:59 -0400 Subject: Btrfs: reduce the amount of space needed for truncates With btrfs_truncate_inode_items we always return if we have to go to another leaf, which makes us do our reservation again. This means we will only ever modify one leaf at a time, so we only need 1 item's worth of slack space. Also, since we are deleting we will not be creating nodes as we go down, if anything we'll be free'ing them as we merge them together, so make a different calculation for truncate which will only have the worst case usage of COW'ing the entire path down to the leaf. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 +++++++++++ fs/btrfs/inode.c | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 22a9347a3908..2e18b068841b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2125,6 +2125,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3 * num_items; } +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. + */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + num_items; +} + void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4d057c084de6..8316b570db55 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3528,7 +3528,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - u64 min_size = btrfs_calc_trans_metadata_size(root, 2); + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -6482,7 +6482,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; - u64 min_size = btrfs_calc_trans_metadata_size(root, 2); + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6532,12 +6532,12 @@ static int btrfs_truncate(struct inode *inode) return -ENOMEM; /* - * 2 for the truncate slack space + * 1 for the truncate slack space * 1 for the orphan item we're going to add * 1 for the orphan item deletion * 1 for updating the inode.
*/ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } -- cgit v1.2.3-58-ga151 From 482e6dc5261406fdb921946e70b51467b0305bad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 10:31:56 -0400 Subject: Btrfs: allow callers to specify if flushing can occur for btrfs_block_rsv_check If you run xfstest 224 you will get lots of messages about not being able to delete inodes and that they will be cleaned up next mount. This is because btrfs_block_rsv_check was not calling reserve_metadata_bytes with the ability to flush, so if there was not enough space, it simply failed. But in the truncate and evict cases we could easily flush space to try and get enough space to do our work, so make btrfs_block_rsv_check take a flush argument to pass down to reserve_metadata_bytes. Now xfstests 224 runs fine without all those complaints. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/transaction.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2e18b068841b..caa73cd8c00a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2244,7 +2244,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor); + u64 min_reserved, int min_factor, int flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d05967e9d613..a71fcf506531 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3705,7 +3705,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) + u64 min_reserved, int min_factor, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3728,7 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecc1a4f85d20..b0122e19db6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -199,7 +199,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, - 0, 5); + 0, 5, 0); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8316b570db55..e40b9239660d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3576,10 +3576,10 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate.
*/ while (1) { - ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0); + ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1); if (ret) { printk(KERN_WARNING "Could not get space for a " - "delete, will truncate on mount\n"); + "delete, will truncate on mount %d\n", ret); btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); goto no_delete; } @@ -6575,7 +6575,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0); + ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index aeaed99e9cfe..fd9ac66434b0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2042,7 +2042,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0); + min_reserved, 0, 0); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -3775,7 +3775,7 @@ restart: } ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5); + rc->block_rsv, 0, 5, 0); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3e20cc8c1c06..a1d8c322c1ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -419,7 +419,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, { int ret; ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5); + &root->fs_info->global_block_rsv, 0, 5, 0); return ret ? 1 : 0; } -- cgit v1.2.3-58-ga151 From a9b5fcddce594a408a48d523087b5bb64ce82469 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 12:06:12 -0400 Subject: Btrfs: fix call to btrfs_search_slot in free space cache We are setting ins_len to 1 even though we are just modifying an item that should be there already. This may cause the search stuff to split nodes on the way down needlessly. Set this to 0 since we aren't inserting anything. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b0122e19db6b..701ef5951e3f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -802,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, key.offset = offset; key.type = 0; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, -- cgit v1.2.3-58-ga151 From 7ed49f187c82821e35f8869399bcf90822a74a23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 15:45:52 -0400 Subject: Btrfs: fix space leak when we fail to make an allocation When changing back to using a spin_lock to protect the extent counters I decided that since we would only be dropping our original extent, it was ok to just drop the extent and return. However since somebody else could have come in and done a reservation, we need to do the normal song and dance to clear the reservation out properly. So calculate how much space we need to free, and then subtract what we just attempted to reserve.
If it's more, then we know we need to drop those bytes from the delalloc block rsv. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a71fcf506531..99ab5716baad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4025,16 +4025,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { + u64 to_free = 0; unsigned dropped; - /* - * We don't need the return value since our reservation failed, - * we just need to clean up our counter. - */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - WARN_ON(dropped > 1); - BTRFS_I(inode)->csum_bytes -= num_bytes; + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + /* + * Somebody could have come in and twiddled with the + * reservation, so if we have to free more than we would have + * reserved from this reservation go ahead and release those + * bytes. + */ + to_free -= to_reserve; + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); return ret; } -- cgit v1.2.3-58-ga151 From 7f70150896ebd1169d9c43484c8c424f755353c4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Aug 2011 15:23:19 -0400 Subject: Btrfs: don't increase the block_rsv's size when emergency allocating space If we have to emergency reserve space we need to not increase the block_rsv size, otherwise we'll leak space. Take delalloc, for instance: say we reserve 4k and we use that 4k, and then we have to emergency allocate another 4k. We bump the size up to 8k; however, we've only accounted for 4k in reservations in all of our supporting logic, so we'll go to free the 4k and end up having a size of 4k, which will cause us to later not free as much space. I saw this doing testing where I wasn't reserving enough space for something but was still leaking space, very frustrating. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99ab5716baad..1f1d3e8dcec9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5707,9 +5707,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); -- cgit v1.2.3-58-ga151 From 4a33854257764c2ec6337ee0c8ecafb64f8e29e1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Aug 2011 11:01:31 -0400 Subject: Btrfs: set truncate block rsv's size While debugging a different issue I noticed that we were always reserving space when we tried to use our truncate block rsvs. This is because they didn't have a ->size value, so use_block_rsv just assumes there is nothing reserved and it does a reserve_metadata_bytes. This is because btrfs_block_rsv_check() doesn't actually add to the size of the block rsv. That seems to be the right thing to do, so set ->size to the minimum truncate size we need, since we will always only refill to that size anyway, and this way everything works out correctly.
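To make the ->size versus ->reserved interplay concrete, here is a minimal self-contained userspace C sketch of the behavior the message describes (not kernel code; rsv, use_rsv and fresh_reservations are hypothetical names, and the accounting is deliberately simplified):

#include <stdio.h>

struct rsv {
        unsigned long long size;     /* how much this rsv is meant to maintain */
        unsigned long long reserved; /* how much it currently holds */
};

static int fresh_reservations; /* counts stand-ins for reserve_metadata_bytes() */

/* Simplified take on use_block_rsv(): a zero ->size looks "unmaintained",
 * so every use falls back to a brand new reservation. */
static void use_rsv(struct rsv *r, unsigned long long blocksize)
{
        if (r->size == 0 || r->reserved < blocksize) {
                fresh_reservations++;
                r->reserved += blocksize;
        }
        r->reserved -= blocksize;
}

int main(void)
{
        struct rsv trunc = { 0, 0 };
        int i;

        for (i = 0; i < 4; i++)
                use_rsv(&trunc, 4096);
        printf("size unset: %d fresh reservations\n", fresh_reservations);

        fresh_reservations = 0;
        trunc.size = 4096;
        trunc.reserved = 4096;               /* refilled up front, as the patch does */
        for (i = 0; i < 4; i++) {
                use_rsv(&trunc, 4096);
                trunc.reserved = trunc.size; /* periodic refill to ->size */
        }
        printf("size set:   %d fresh reservations\n", fresh_reservations);
        return 0;
}

With ->size left at 0, every use in this model falls back to a fresh reservation, which is exactly the needless reserving the message complains about.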
Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e40b9239660d..06ae5b173fd7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3561,6 +3561,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_orphan_del(NULL, inode); goto no_delete; } + rsv->size = min_size; btrfs_i_size_write(inode, 0); @@ -6530,6 +6531,7 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; + rsv->size = min_size; /* * 1 for the truncate slack space -- cgit v1.2.3-58-ga151 From 300e4f8a56f263797568c95b71c949f9f02e4534 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Aug 2011 14:06:00 -0400 Subject: Btrfs: put the block group cache after we commit the super In moving some enospc stuff around I noticed that when we unmount we are often evicting the free space cache inodes before we do our last commit. This isn't bad, but it makes us constantly have to re-read the inodes back. So instead don't evict the cache until after we do our last commit, this will make things a little less crappy and makes a future enospc change work properly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/free-space-cache.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0b5643a68d57..4965a0179b31 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2543,8 +2543,6 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(root->fs_info); - btrfs_put_block_group_cache(fs_info); - /* * Here come 2 situations when btrfs is broken to flip readonly: * @@ -2570,6 +2568,8 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + btrfs_put_block_group_cache(fs_info); + kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 701ef5951e3f..1ea10731797a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -105,7 +105,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!btrfs_fs_closing(root->fs_info)) { + if (!block_group->iref) { block_group->inode = igrab(inode); block_group->iref = 1; } -- cgit v1.2.3-58-ga151 From c09544e07f8cdc455ed8615d4c067d694c33bd18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 10:19:10 -0400 Subject: Btrfs: handle enospc accounting for free space inodes Since free space inodes now use normal checksumming we need to make sure to account for their metadata use. So reserve metadata space, and then if we fail to write out the metadata we can just release it, otherwise it will be freed up when the io completes. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 18 +++++++++++++----- fs/btrfs/free-space-cache.c | 44 +++++++++++++++++++++++++++++--------------- fs/btrfs/inode-map.c | 6 ++++-- fs/btrfs/inode.c | 2 +- 4 files changed, 47 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1f1d3e8dcec9..ccdc4d12e8d4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2755,16 +2755,20 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_delalloc_reserve_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) + if (!ret) { dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + btrfs_free_reserved_data_space(inode, num_pages); + } else { + btrfs_delalloc_release_space(inode, num_pages); + } + out_put: iput(inode); out_free: @@ -4002,9 +4006,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve = 0; unsigned nr_extents = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4023,7 +4031,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1ea10731797a..3bde17ff14c0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -532,6 +532,19 @@ out: return ret; } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. 
+ */ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, @@ -555,7 +568,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; - int ret = -1; + int ret; + int err = -1; bool next_page = false; bool out_of_space = false; @@ -563,7 +577,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, node = rb_first(&ctl->free_space_offset); if (!node) - return 0; + return -1; if (!i_size_read(inode)) return -1; @@ -767,7 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - ret = 0; goto out; } @@ -789,10 +802,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - if (ret) { - ret = 0; + if (ret) goto out; - } BTRFS_I(inode)->generation = trans->transid; @@ -804,7 +815,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); @@ -818,7 +828,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, @@ -835,16 +844,15 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = 1; - + err = 0; out: kfree(pages); - if (ret != 1) { + if (err) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return ret; + return err; } int btrfs_write_out_cache(struct btrfs_root *root, @@ -871,14 +879,16 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); - if (ret < 0) { + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); ret = 0; - +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free space cace " "for block group %llu\n", block_group->key.objectid); +#endif } iput(inode); @@ -2662,9 +2672,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret < 0) + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free ino cache " "for root %llu\n", root->root_key.objectid); +#endif + } iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa871..53dcbdf446cd 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -465,14 +465,16 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, prealloc); if 
(ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); - if (ret) + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); goto out_put; + } btrfs_free_reserved_data_space(inode, prealloc); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ae5b173fd7..78b5ae59ac4f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1792,11 +1792,11 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } ret = 0; out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); if (nolock) { if (trans) btrfs_end_transaction_nolock(trans, root); } else { - btrfs_delalloc_release_metadata(inode, ordered_extent->len); if (trans) btrfs_end_transaction(trans, root); } -- cgit v1.2.3-58-ga151 From 4c13d758b7e79c14a0026c1f783f0c79e339b7bb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:31:29 -0400 Subject: Btrfs: use the transactions block_rsv for the csum root The alloc warnings everybody has been seeing are because we have been reserving space for csums, but we weren't actually using that space. So make get_block_rsv() return the trans->block_rsv if we're modifying the csum root. Also set the trans->block_rsv to NULL so that if we modify the csum root when running delayed refs, the space comes out of the global reserve like it's supposed to. With this patch I'm not seeing those alloc warnings anymore. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++------ fs/btrfs/transaction.c | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ccdc4d12e8d4..53f6dbdab510 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3565,10 +3565,12 @@ out: static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3865,12 +3867,13 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; + if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + block_rsv = &root->fs_info->trans_block_rsv; + btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a1d8c322c1ba..a770f4bd9d31 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -453,6 +453,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; -- cgit v1.2.3-58-ga151 From d02c9955ded7fc56dd1edc987558b084ccb03eb4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:40:48 -0400 Subject: Btrfs: don't get the block_rsv in btrfs_free_tree_block Since the durable block rsv stuff has been killed there is no need to get the block_rsv in btrfs_free_tree_block anymore.
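The get_block_rsv() hunk above reduces to a three-step fallback. Here is a compilable userspace C model of that selection order; pick_rsv and the structs are hypothetical stand-ins for the kernel types:

#include <stdio.h>

struct rsv { const char *name; };

static struct rsv global_rsv = { "global" };

struct root  { int ref_cows; int is_csum_root; struct rsv *rsv; };
struct trans { struct rsv *rsv; };

/* Selection order: prefer the transaction's rsv for COW-able roots and
 * the csum root, then the root's own rsv, then a last-resort fallback. */
static struct rsv *pick_rsv(struct trans *t, struct root *r)
{
        struct rsv *rsv = NULL;

        if (r->ref_cows || r->is_csum_root)
                rsv = t->rsv;
        if (!rsv)
                rsv = r->rsv;
        if (!rsv)
                rsv = &global_rsv;
        return rsv;
}

int main(void)
{
        struct rsv trans_rsv = { "trans" };
        struct trans t = { &trans_rsv };
        struct trans t_null = { NULL };
        struct root csum = { 0, 1, NULL };

        printf("csum root, trans rsv set : %s\n", pick_rsv(&t, &csum)->name);
        printf("csum root, trans rsv NULL: %s\n", pick_rsv(&t_null, &csum)->name);
        return 0;
}

Run, this picks the transaction rsv for the csum root when one is set and falls through to the global fallback when it is NULL, which is why NULLing trans->block_rsv in the follow-up patches routes csum deletions to the global reserve.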
Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53f6dbdab510..d236df790156 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4707,7 +4707,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct btrfs_block_group_cache *cache = NULL; int ret; @@ -4722,10 +4721,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -- cgit v1.2.3-58-ga151 From 4a92b1b8d2810db4ea0c34616b94c0b3810fa027 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 12:34:28 -0400 Subject: Btrfs: stop passing a trans handle all around the reservation code The only thing that we need to have a trans handle for is in reserve_metadata_bytes and thats to know how much flushing we can do. So instead of passing it around, just check current->journal_info for a trans_handle so we know if we can commit a transaction to try and free up space or not. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 ++---- fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++---------------------- fs/btrfs/free-space-cache.c | 4 +--- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 15 +++++++-------- fs/btrfs/transaction.c | 8 ++++---- 6 files changed, 39 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index caa73cd8c00a..a5faf8e33baa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2237,12 +2237,10 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d236df790156..9bb71597aa54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3404,29 +3404,34 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. 
+/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; + struct btrfs_trans_handle *trans; u64 unused; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; bool committed = false; bool flushing = false; + + trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3689,8 +3694,7 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { @@ -3699,7 +3703,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3708,8 +3712,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush) { @@ -3734,7 +3737,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4034,7 +4037,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5689,8 +5692,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -5711,8 +5713,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3bde17ff14c0..ffc42ef44711 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,9 +197,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, - 0, 5, 0); + ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78b5ae59ac4f..2947e94947b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3577,7 +3577,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); if (ret) { printk(KERN_WARNING "Could not get space for a " "delete, will truncate on mount %d\n", ret); @@ -6577,7 +6577,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fd9ac66434b0..3ab67409f90f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0, 0); + ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved, + 0, 0); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2152,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) err = ret; } @@ -2427,7 +2426,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -3645,7 +3644,7 @@ int prepare_to_relocate(struct reloc_control *rc) * btrfs_init_reloc_root will use them when there * is no reservation in transaction handle. 
*/ - ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3774,8 +3773,8 @@ restart: } } - ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0, + 5, 0); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a770f4bd9d31..8d6f4c78f73f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, */ if (num_items > 0 && root != root->fs_info->chunk_root) { num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(NULL, root, + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes); if (ret) @@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0, + 5, 0); return ret ? 1 : 0; } @@ -914,7 +914,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + ret = btrfs_block_rsv_add(root, &pending->block_rsv, to_reserve); if (ret) { pending->error = ret; -- cgit v1.2.3-58-ga151 From 9c8d86db9aee6f85866d480e0f9b37817264814c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Sep 2011 11:58:54 -0400 Subject: Btrfs: make sure to unset trans->block_rsv before running delayed refs Checksums are charged in 2 different ways. The first case is when we're writing to the disk, we account for the new checksums with the delalloc block rsv. In order for this to work we check if we're allocating a block for the csum root and if trans->block_rsv == the delalloc block rsv. But when we're deleting the csums because of cow, this is charged to the global block rsv, and is done when we run the delayed refs. So we need to make sure that trans->block_rsv == NULL when running the delayed refs. So set it to NULL and reset it in should_end_transaction, and set it to NULL in commit_transaction. This got rid of the ridiculous amount of warnings I was seeing when trying to do a balance. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8d6f4c78f73f..7debbf396ef3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; + /* + * We need to do this in case we're deleting csums so the global block + * rsv get's used instead of the csum block rsv. 
+ */ + trans->block_rsv = NULL; + updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) btrfs_run_delayed_refs(trans, root, updates); + trans->block_rsv = rsv; + return should_end_transaction(trans, root); } @@ -1167,6 +1176,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ -- cgit v1.2.3-58-ga151 From 455757c322cc0a0f2a692c5625dd88aaf6a7b889 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Sep 2011 12:26:24 -0400 Subject: Btrfs: delay iput when deleting a block group I kept getting warnings from evict because we were calling btrfs_start_transaction() with a transaction already started when doing a balance. This is because we remove a block group, which requires a transaction, and then put the last reference on the cache inode. Instead of doing this we need to delay the iput so it is not done within an already-started transaction. This gets rid of our warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bb71597aa54..5498bdacd4c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7266,7 +7266,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; -- cgit v1.2.3-58-ga151 From 3b16a4e3c355ee3c790473decfcf83d4faeb8ce0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Sep 2011 15:05:58 -0400 Subject: Btrfs: use the inode's mapping mask for allocating pages Johannes pointed out we were allocating only kernel pages for doing writes, which is kind of a big deal if you are on 32bit and have more than a gig of ram. So fix our allocations to use the mapping's gfp but still clear __GFP_FS so we don't re-enter.
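The fix (the btrfs_alloc_write_mask() helper in the diff below) just masks one bit out of the mapping's allocation flags. A userspace C model with made-up flag values, since the real gfp bits are kernel-internal:

#include <stdio.h>

/* Illustrative bit values only; the real gfp flags live in the kernel. */
#define FAKE_GFP_FS      0x01u
#define FAKE_GFP_IO      0x02u
#define FAKE_GFP_HIGHMEM 0x04u

typedef unsigned int gfp_t;

/* Mirror of the helper's intent: keep whatever the mapping allows
 * (notably highmem on 32-bit) but drop FS reentry. */
static gfp_t alloc_write_mask(gfp_t mapping_mask)
{
        return mapping_mask & ~FAKE_GFP_FS;
}

int main(void)
{
        gfp_t mapping = FAKE_GFP_FS | FAKE_GFP_IO | FAKE_GFP_HIGHMEM;
        gfp_t mask = alloc_write_mask(mapping);

        printf("highmem kept: %s\n", (mask & FAKE_GFP_HIGHMEM) ? "yes" : "no");
        printf("fs cleared  : %s\n", (mask & FAKE_GFP_FS) ? "no" : "yes");
        return 0;
}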
Thanks, Reported-by: Johannes Weiner Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/file.c | 3 ++- fs/btrfs/free-space-cache.c | 6 ++++-- fs/btrfs/inode.c | 3 ++- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/relocation.c | 3 ++- 6 files changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a5faf8e33baa..47dea7118e0e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -2117,6 +2118,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; +} + /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index de569af766fe..f2e928289600 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; u64 start_pos; @@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - GFP_NOFS); + mask); if (!pages[i]) { faili = i - 1; err = -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ffc42ef44711..0a8ccdbdd464 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -254,6 +254,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, u64 num_bitmaps; u64 generation; pgoff_t index = 0; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -310,7 +311,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries && !num_bitmaps) break; - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, mask); if (!page) goto free_cache; @@ -563,6 +564,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, u64 start, end, len; u64 bytes = 0; u32 crc = ~(u32)0; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; @@ -612,7 +614,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * know and don't freak out. 
*/ while (index < num_pages) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, mask); if (!page) { int i; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2947e94947b5..88e3956be57d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3282,6 +3282,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; u64 page_start; u64 page_end; @@ -3294,7 +3295,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = find_or_create_page(mapping, index, GFP_NOFS); + page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 538f65a79ec5..24fd75bb0f96 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -843,6 +843,7 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) return 0; @@ -860,7 +861,7 @@ again: for (i = 0; i < num_pages; i++) { struct page *page; page = find_or_create_page(inode->i_mapping, - start_index + i, GFP_NOFS); + start_index + i, mask); if (!page) break; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3ab67409f90f..7fa090fa0d39 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2921,6 +2921,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unsigned long last_index; struct page *page; struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int nr = 0; int ret = 0; @@ -2955,7 +2956,7 @@ static int relocate_file_extent_cluster(struct inode *inode, ra, NULL, index, last_index + 1 - index); page = find_or_create_page(inode->i_mapping, index, - GFP_NOFS); + mask); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); -- cgit v1.2.3-58-ga151 From a8c9e5769718d47e87cce40c9b84cab421804797 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Sep 2011 16:55:59 -0400 Subject: Btrfs: fix orphan cleanup regression In fixing how we deal with bad inodes, we had a regression in the orphan cleanup code, since it expects to get a bad inode back. So fix it to deal with getting -ESTALE back by deleting the orphan item manually and moving on. 
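A sketch of the control flow this fix establishes, as a self-contained userspace C program; lookup_inode and handle_orphan are hypothetical stand-ins for btrfs_iget() and the cleanup loop body:

#include <stdio.h>
#include <errno.h>

enum lookup { FOUND, GONE };

/* Stand-in for btrfs_iget(): the inode may be gone already. */
static int lookup_inode(enum lookup what) { return what == GONE ? -ESTALE : 0; }

/* Only -ESTALE is survivable: it means "inode gone, orphan item left
 * behind", so drop the item and continue with the next orphan. */
static int handle_orphan(enum lookup what)
{
        int ret = lookup_inode(what);

        if (ret && ret != -ESTALE)
                return ret;   /* real error, bail out */
        if (ret == -ESTALE) {
                printf("deleting stale orphan item\n");
                return 0;     /* continue with the next orphan */
        }
        printf("queueing live inode for orphan handling\n");
        return 0;
}

int main(void)
{
        handle_orphan(GONE);
        handle_orphan(FOUND);
        return 0;
}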
Thanks, Reported-by: Simon Kirby Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88e3956be57d..8005be176252 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2285,37 +2285,35 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + ret = PTR_RET(inode); + if (ret && ret != -ESTALE) goto out; - } - - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); /* - * if this is a bad inode, means we actually succeeded in - * removing the inode, but not the orphan record, which means - * we need to manually delete the orphan since iput will just - * do a destroy_inode + * Inode is already gone but the orphan item is still there, + * kill the orphan item. */ - if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 0); + if (ret == -ESTALE) { + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } - btrfs_orphan_del(trans, inode); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + BUG_ON(ret); btrfs_end_transaction(trans, root); - iput(inode); continue; } + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { if (!S_ISREG(inode->i_mode)) { -- cgit v1.2.3-58-ga151 From ef3be45722317f8c2fb0e861065df0c3830ff9ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 22 Sep 2011 14:30:02 -0400 Subject: Btrfs: check unused against how much space we actually want There is a bug that may lead to early ENOSPC in our reservation code. We've been checking against num_bytes which may be above and beyond what we want to actually reserve, which could give us a false ENOSPC. Fix this by making sure the unused space is above how much we want to reserve and not how much we're trying to flush. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5498bdacd4c3..fd65f6bc676c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3472,7 +3472,7 @@ again: */ if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { + if (unused >= orig_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { -- cgit v1.2.3-58-ga151 From 462d6fac8960a3ba797927adfcbd29d503eb16fd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 13:56:12 -0400 Subject: Btrfs: introduce convert_extent_bit If I have a range where I know a certain bit is set and I want to change it to another bit, the only option I have is to call set and then clear the bit, which will result in 2 tree searches. This is inefficient, so introduce convert_extent_bit, which will go through and set the bit I want and clear the old bit I don't want.
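The semantics are easy to model in userspace with a flat array of per-offset flags; the kernel's real win is that the set and the clear happen in one rbtree pass instead of two searches with two rounds of state splitting. A rough C sketch (the names and the flat array are illustrative only):

#include <stdio.h>
#include <stddef.h>

enum { DIRTY = 1 << 0, NEED_WAIT = 1 << 1 };

/* Set one bit and clear another across [start, end] in a single pass. */
static void convert_bits(unsigned char *state, size_t start, size_t end,
                         int set_bit, int clear_bit)
{
        size_t i;

        for (i = start; i <= end; i++) {
                state[i] |= set_bit;
                state[i] &= ~clear_bit;
        }
}

int main(void)
{
        unsigned char state[8] = { 0 };
        size_t i;

        convert_bits(state, 0, 7, DIRTY, 0);         /* mark range dirty */
        convert_bits(state, 2, 5, NEED_WAIT, DIRTY); /* DIRTY -> NEED_WAIT */
        for (i = 0; i < 8; i++)
                printf("%zu: dirty=%d wait=%d\n", i,
                       !!(state[i] & DIRTY), !!(state[i] & NEED_WAIT));
        return 0;
}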
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 + 2 files changed, 190 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7d5e55632809..0ada0b700b44 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -894,6 +894,194 @@ search_again: goto again; } +/** + * convert_extent - convert all bits in a given range from one bit to another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + next_node = rb_next(&state->rb_node); + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. 
+ */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); + if (err) { + free_extent_state(prealloc); + prealloc = NULL; + goto out; + } + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) + return -ENOMEM; + + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, &bits); + clear_state_bit(tree, prealloc, &clear_bits, 0); + + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + /* wrappers around set/clear extent bit */ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..cea445dcd806 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -214,6 +214,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, -- cgit v1.2.3-58-ga151 From 1728366efa5ebf48bd2ed544afa8700cd07ba822 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 13:58:47 -0400 Subject: Btrfs: stop using write_one_page While looking for a performance regression a user was complaining about, I noticed that we had a regression with the varmail test of filebench. This was introduced by 0d10ee2e6deb5c8409ae65b970846344897d5e4e which keeps us from calling writepages in writepage. This is a correct change, however it happens to help the varmail test because we write out in larger chunks. This is largly to do with how we write out dirty pages for each transaction. If you run filebench with load varmail set $dir=/mnt/btrfs-test run 60 prior to this patch you would get ~1420 ops/second, but with the patch you get ~1200 ops/second. This is a 16% decrease. 
So since we know the range of dirty pages we want to write out, don't write out in one page chunks, write out in ranges. So to do this we call filemap_fdatawrite_range() on the range of bytes. Then we convert the DIRTY extents to NEED_WAIT extents. When we then call btrfs_wait_marked_extents() we only have to filemap_fdatawait_range() on that range and clear the NEED_WAIT extents. This doesn't get us back to our original speeds, but I've been seeing ~1380 ops/second, which is a <5% regression as opposed to a >15% regression. That is acceptable given that the original commit greatly reduces our latency to begin with. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.h | 1 + fs/btrfs/transaction.c | 86 +++++++++++--------------------------------------- 2 files changed, 20 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cea445dcd806..325a346369da 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,7 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7debbf396ef3..45655793a2c5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -572,50 +572,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - while (start <= end) { - cond_resched(); - - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - - btree_lock_page_hook(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - - if (PageWriteback(page)) { - if (PageDirty(page)) - wait_on_page_writeback(page); - else { - unlock_page(page); - page_cache_release(page); - continue; - } - } - err = write_one_page(page, 0); - if (err) - werr = err; - page_cache_release(page); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, + GFP_NOFS); + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -631,39 +602,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << 
PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - if (PageDirty(page)) { - btree_lock_page_hook(page); - wait_on_page_writeback(page); - err = write_one_page(page, 0); - if (err) - werr = err; - } - wait_on_page_writeback(page); - page_cache_release(page); - cond_resched(); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT)) { + clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; -- cgit v1.2.3-58-ga151 From 726c35fa0edf1d9b8a88b73255532e73089aedda Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 15:46:06 -0400 Subject: Btrfs: use the global reserve as a backup for deleting inodes Xfstests 83 really stresses our ENOSPC since it uses a 100mb fs which ends up with the mixed block group stuff. Because of this we can run into a situation where we don't have enough space to delete inodes, or even worse we can't free the inodes when we next mount the fs which causes the orphan code to lose its mind. So if we fail to make our reservation, steal from the global reserve. The global reserve will end up taking up the entire rest of the free space on the fs in this worst case so there really is no other option. With this patch test 83 doesn't freak out. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8005be176252..96fc9e342219 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3526,7 +3526,7 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv *rsv, *global_rsv; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -3561,6 +3561,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } rsv->size = min_size; + global_rsv = &root->fs_info->global_block_rsv; btrfs_i_size_write(inode, 0); @@ -3577,6 +3578,15 @@ void btrfs_evict_inode(struct inode *inode) */ while (1) { ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. + */ + if (ret) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); + if (ret) { printk(KERN_WARNING "Could not get space for a " "delete, will truncate on mount %d\n", ret); -- cgit v1.2.3-58-ga151 From 8f6d7f4f45f18a5b669dbbf068c74b3d5be59dbf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 15:55:20 -0400 Subject: Btrfs: break out of orphan cleanup if we can't make progress I noticed while running xfstests 83 that if we didn't have enough space to delete our inode the orphan cleanup would just loop. This is because it keeps finding the same orphan item and keeps trying to kill it but can't because we don't get an error back from iput for deleting the inode. So keep track of the last guy we tried to kill, if it's the same as the one we're trying to kill currently we know we are having problems and can just error out. I don't have a way to test this so look hard and make sure it's right. 
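The guard described here (and shown in the diff just below) is a plain no-progress check. A self-contained C sketch, with a deliberately stuck iterator standing in for an orphan item that never actually goes away:

#include <stdio.h>

/* Contrived stand-in for "find the next orphan key": deletion keeps
 * failing silently, so the same key comes back forever. */
static unsigned long long next_orphan(void)
{
        return 42;
}

static int orphan_cleanup(void)
{
        unsigned long long last = 0;
        int passes;

        for (passes = 0; passes < 10; passes++) {
                unsigned long long key = next_orphan();

                if (key == last) {
                        fprintf(stderr, "no progress on %llu, stopping\n", key);
                        return -1; /* the kernel code returns -EINVAL here */
                }
                last = key;
                /* ... try to drop the orphan; may quietly fail ... */
        }
        return 0;
}

int main(void)
{
        return orphan_cleanup() ? 1 : 0;
}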
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 96fc9e342219..15adfb542502 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2230,6 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; + u64 last_objectid = 0; int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) @@ -2281,6 +2282,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ + + if (found_key.offset == last_objectid) { + printk(KERN_ERR "btrfs: Error removing orphan entry, " + "stopping orphan cleanup\n"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; -- cgit v1.2.3-58-ga151 From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 17:12:22 -0400 Subject: Btrfs: allow us to overcommit our enospc reservations One of the things that kills us is the fact that our ENOSPC reservations are horribly over the top in most normal cases. There isn't too much that can be done about this because when we are completely full we really need them to work like this so we don't under reserve. However if there is plenty of unallocated chunks on the disk we can use that to gauge how much we can overcommit. So this patch adds chunk free space accounting so we always know how much unallocated space we have. Then if we fail to make a reservation within our allocated space, check to see if we can overcommit. In the normal flushing case (like with delalloc metadata reservations) we'll take the free space and divide it by 2 if our metadata profile is setup for DUP or any of those, and then divide it by 8 to make sure we don't overcommit too much. Then if we're in a non-flushing case (we really need this reservation now!) we only limit ourselves to half of the free space. This makes this fio test [torrent] filename=torrent-test rw=randwrite size=4g ioengine=sync directory=/mnt/btrfs-test go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB file system. This doesn't seem to break my other enospc tests, but could really use some more testing as this is a super scary change. 
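To make the overcommit math concrete, here is a worked example (the numbers are illustrative only, not from the patch): with 8GiB of unallocated chunk space and a DUP/RAID1/RAID10 metadata profile, the reservation path computes

	u64 avail = 8ULL << 30;	/* 8GiB unallocated, example value */

	avail >>= 1;		/* DUP/RAID1/RAID10: half usable -> 4GiB */

	if (flush)
		avail >>= 3;	/* flushing case: 512MiB of overcommit */
	else
		avail >>= 1;	/* need it right now: 2GiB of overcommit */

	/* the reservation succeeds if used + bytes < total_bytes + avail */
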
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++++------------ fs/btrfs/volumes.c | 39 ++++++++++++++++++++++++++++---- 4 files changed, 88 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47dea7118e0e..1eafccb162ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -893,6 +893,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4965a0179b31..51372a521167 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); @@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd65f6bc676c..25b69d0f9135 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation + * @check - whether this is just to check if we have enough space or not * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, int flush, int check) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; @@ -3459,9 +3460,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= orig_bytes) { + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { @@ -3489,10 +3489,43 @@ again: * amount plus the amount of bytes that we need for this * reservation. 
*/ - num_bytes = unused - space_info->total_bytes + + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret && !check) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we can flush, be conservative and only let us overcommit + * up to 1/8th of the space, since flushing will reclaim more + * as we go. If we can't flush (we need this reservation right + * now), allow overcommitting up to 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + orig_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..e138af710de2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; @@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2257,6 +2275,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } -- cgit v1.2.3-58-ga151 From e27425d614d68daa08f60735982a7c3a0230e855 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 27 Sep 2011 11:01:30 -0400 Subject: Btrfs: only inherit btrfs specific flags when creating files Xfstests 
79 was failing because we were inheriting the S_APPEND flag when we weren't supposed to. There isn't any specific documentation on this, so I'm taking the test as the standard of how things work; according to it, having S_APPEND set on a directory doesn't mean that S_APPEND gets inherited by its children. So only inherit btrfs specific things. This will let us set compress/nocompress on specific directories, and everything in those directories will inherit the flag, same with nodatacow. With this patch test 79 passes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 24fd75bb0f96..d2b53eb8a8c2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -117,7 +117,7 @@ void btrfs_update_iflags(struct inode *inode) /* * Inherit flags from the parent inode. * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited. */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { @@ -128,12 +128,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) flags = BTRFS_I(dir)->flags; - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - BTRFS_I(inode)->flags = flags; btrfs_update_iflags(inode); } -- cgit v1.2.3-58-ga151 From 73bc187680f94bed498f8a669103cad290e41180 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Oct 2011 14:07:49 -0400 Subject: Btrfs: introduce mount option no_space_cache Some users have requested this, and I've found I needed a way to disable cache loading without actually clearing the cache, so introduce the no_space_cache option. Previously we checked the super block's cache generation field and, if it was populated, always turned space caching on. Now we check this and set the space cache option on, and then parse the mount options so that if we want it off it gets turned off. Then we check the mount option in all the places we do the caching work instead of checking the super's cache generation. This makes things more consistent and lets us turn space caching off. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 ++++----- fs/btrfs/super.c | 21 +++++++++++++++++---- fs/btrfs/transaction.c | 2 +- 3 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 25b69d0f9135..f9711a82fc54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -481,7 +481,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks. 
*/ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -4223,7 +4224,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -7038,13 +7039,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 934789f7fd33..266d1f35465d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_err, }; static match_table_t tokens = { @@ -197,6 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_no_space_cache, "no_space_cache"}, {Opt_err, NULL}, }; @@ -208,14 +209,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; char *compress_type; bool compress_force = false; + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -362,9 +368,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DISCARD); break; case Opt_space_cache: - printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_no_space_cache: + printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_inode_cache: printk(KERN_INFO "btrfs: enabling inode map caching\n"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -393,6 +402,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); kfree(orig); return ret; } @@ -687,6 +698,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",no_space_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c 
index 45655793a2c5..1e1a4816ccb0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1003,7 +1003,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; } -- cgit v1.2.3-58-ga151 From f75b130e9bb361850787e156c79311adb84f551e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 10:00:18 -0400 Subject: Btrfs: don't skip writing out an empty block group's cache I noticed a slight bug where we will not bother writing out the block group cache's space cache if its space tree is empty. Since it could have a cluster or pinned extents that need to be written out, this is just not a valid test. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0a8ccdbdd464..b81556ca75ea 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -575,10 +575,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, INIT_LIST_HEAD(&bitmap_list); - node = rb_first(&ctl->free_space_offset); - if (!node) - return -1; - if (!i_size_read(inode)) return -1; @@ -639,6 +635,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (block_group) start = block_group->key.objectid; + node = rb_first(&ctl->free_space_offset); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; -- cgit v1.2.3-58-ga151 From a67509c30079f4c5025fb19ea443fb2906c3a85e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 15:18:58 -0400 Subject: Btrfs: add an io_ctl struct and helpers for dealing with the space cache In writing and reading the space cache we have one big loop that keeps track of which page we are on, and then a bunch of sizeable loops underneath this big loop to try and read/write out properly. Especially in the write case this makes things hugely complicated and hard to follow, and makes our error checking and recovery equally complex. So add an io_ctl struct with a bunch of helpers to keep track of the pages we have, where we are, if we have enough space, etc. This unifies how we deal with the pages we're writing and keeps all the messy tracking internal. This allows us to kill the big loops in both the read and write case and makes reviewing and changing the write and read paths much simpler. I've run xfstests and stress.sh on this code and it survives. 
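With the helpers in place, the write path collapses to roughly this shape (a condensed sketch of the code in the diff below; error handling and the pinned-extent walk are omitted):

	struct io_ctl io_ctl;

	io_ctl_init(&io_ctl, inode, root);
	io_ctl_prepare_pages(&io_ctl, inode, 0);	/* find and lock pages */
	io_ctl_set_generation(&io_ctl, trans->transid);

	/* stream out entries; page boundaries are handled internally */
	io_ctl_add_entry(&io_ctl, e->offset, e->bytes, e->bitmap);
	io_ctl_add_bitmap(&io_ctl, entry->bitmap);

	io_ctl_zero_remaining_pages(&io_ctl);
	io_ctl_drop_pages(&io_ctl);
	io_ctl_free(&io_ctl);
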
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 693 ++++++++++++++++++++++++-------------------- 1 file changed, 375 insertions(+), 318 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b81556ca75ea..35bfc13c9d42 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -241,27 +241,275 @@ static int readahead_cache(struct inode *inode) return 0; } +struct io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + unsigned long size; + int index; + int num_pages; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root) +{ + memset(io_ctl, 0, sizeof(struct io_ctl)); + io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, + GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + io_ctl->root = root; + return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ + kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ + if (io_ctl->cur) { + kunmap(io_ctl->page); + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ + WARN_ON(io_ctl->cur); + BUG_ON(io_ctl->index >= io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = kmap(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the first 64bits to make sure theres a bogus crc for old + * kernels + */ + io_ctl->cur += sizeof(u64); + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *gen; + + io_ctl_map_page(io_ctl, 0); + + /* Skip the bogus crc area */ + io_ctl->cur += sizeof(u64); + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "btrfs: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = 
io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + /* + * index == 1 means the current page is 0, we need to generate a bogus + * crc for older kernels. + */ + if (io_ctl->index == 1) { + u32 *tmp; + u32 crc = ~(u32)0; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64), + crc, PAGE_CACHE_SIZE - sizeof(u64)); + btrfs_csum_final(crc, (char *)&crc); + crc++; + tmp = io_ctl->orig; + *tmp = crc; + } + io_ctl_unmap_page(io_ctl); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_unmap_page(io_ctl); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_unmap_page(io_ctl); + } +} + +static u8 io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + struct btrfs_free_space_entry *e; + u8 type; + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return type; + + io_ctl_unmap_page(io_ctl); + + if (io_ctl->index >= io_ctl->num_pages) + return type; + + io_ctl_map_page(io_ctl, 0); + return type; +} + +static void io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + BUG_ON(!io_ctl->cur); + if (io_ctl->cur != io_ctl->orig) { + io_ctl_unmap_page(io_ctl); + io_ctl_map_page(io_ctl, 0); + } + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct page *page; + struct io_ctl io_ctl; struct btrfs_key key; + struct btrfs_free_space *e, *n; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - pgoff_t index = 0; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + u8 type; int ret = 0; INIT_LIST_HEAD(&bitmaps); /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) - goto out; + return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -269,11 +517,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return 0; else if (ret > 0) { 
btrfs_release_path(path); - ret = 0; - goto out; + return 0; } ret = -1; @@ -291,170 +538,89 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, " not match free space cache generation (%llu)\n", (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation); - goto out; + return 0; } if (!num_entries) - goto out; + return 0; + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; - while (1) { - struct btrfs_free_space_entry *entry; - struct btrfs_free_space *e; - void *addr; - unsigned long offset = 0; - int need_loop = 0; + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; - if (!num_entries && !num_bitmaps) - break; + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; - page = find_or_create_page(inode->i_mapping, index, mask); - if (!page) + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) goto free_cache; - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); - goto free_cache; - } + type = io_ctl_read_entry(&io_ctl, e); + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; } - addr = kmap(page); - - if (index == 0) { - u64 *gen; - /* - * We put a bogus crc in the front of the first page in - * case old kernels try to mount a fs with the new - * format to make sure they discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); - - gen = addr; - if (*gen != BTRFS_I(inode)->generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache" - " generation (%llu) does not match " - "inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); - kunmap(page); - unlock_page(page); - page_cache_release(page); + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } - addr += sizeof(u64); - offset += sizeof(u64); - } - entry = addr; - - while (1) { - if (!num_entries) - break; - - need_loop = 1; - e = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!e) { - kunmap(page); - unlock_page(page); - page_cache_release(page); + } else { + BUG_ON(!num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); goto free_cache; } - - e->offset = le64_to_cpu(entry->offset); - e->bytes = le64_to_cpu(entry->bytes); - if (!e->bytes) { - kunmap(page); + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); kmem_cache_free(btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); goto free_cache; } - - if (entry->type == BTRFS_FREE_SPACE_EXTENT) { - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - } else { - e->bitmap = kzalloc(PAGE_CACHE_SIZE, 
GFP_NOFS); - if (!e->bitmap) { - kunmap(page); - kmem_cache_free( - btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - list_add_tail(&e->list, &bitmaps); - } - - num_entries--; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - break; - entry++; + list_add_tail(&e->list, &bitmaps); } - /* - * We read an entry out of this page, we need to move on to the - * next page. - */ - if (need_loop) { - kunmap(page); - goto next; - } + num_entries--; + } - /* - * We add the bitmaps at the end of the entries in order that - * the bitmap entries are added to the cache. - */ - e = list_entry(bitmaps.next, struct btrfs_free_space, list); + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); - kunmap(page); - num_bitmaps--; -next: - unlock_page(page); - page_cache_release(page); - index++; + io_ctl_read_bitmap(&io_ctl, e); } + io_ctl_drop_pages(&io_ctl); ret = 1; out: + io_ctl_free(&io_ctl); return ret; free_cache: + io_ctl_drop_pages(&io_ctl); __btrfs_remove_free_space_cache(ctl); goto out; } @@ -554,40 +720,28 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct extent_buffer *leaf; struct rb_node *node; struct list_head *pos, *n; - struct page **pages; - struct page *page; struct extent_state *cached_state = NULL; struct btrfs_free_cluster *cluster = NULL; struct extent_io_tree *unpin = NULL; + struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; u64 start, end, len; - u64 bytes = 0; - u32 crc = ~(u32)0; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; int ret; int err = -1; - bool next_page = false; - bool out_of_space = false; INIT_LIST_HEAD(&bitmap_list); if (!i_size_read(inode)) return -1; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) - return -1; + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -601,30 +755,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, */ unpin = root->fs_info->pinned_extents; - /* - * Lock all pages first so we can lock the extent safely. - * - * NOTE: Because we hold the ref the entire time we're going to write to - * the page find_get_page should never fail, so we don't do a check - * after find_get_page at this point. Just putting this here so people - * know and don't freak out. 
- */ - while (index < num_pages) { - page = find_or_create_page(inode->i_mapping, index, mask); - if (!page) { - int i; - - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - goto out; - } - pages[index] = page; - index++; - } + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); - index = 0; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); @@ -641,166 +774,78 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } - /* Write out the extent entries */ - do { - struct btrfs_free_space_entry *entry; - void *addr, *orig; - unsigned long offset = 0; - - next_page = false; + io_ctl_set_generation(&io_ctl, trans->transid); - if (index >= num_pages) { - out_of_space = true; - break; - } - - page = pages[index]; + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; - orig = addr = kmap(page); - if (index == 0) { - u64 *gen; + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; - /* - * We're going to put in a bogus crc for this page to - * make sure that old kernels who aren't aware of this - * format will be sure to discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); + ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto out_nospc; - gen = addr; - *gen = trans->transid; - addr += sizeof(u64); - offset += sizeof(u64); + if (e->bitmap) { + list_add_tail(&e->list, &bitmap_list); + bitmaps++; } - entry = addr; - - memset(addr, 0, PAGE_CACHE_SIZE - offset); - while (node && !next_page) { - struct btrfs_free_space *e; - - e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; - - entry->offset = cpu_to_le64(e->offset); - entry->bytes = cpu_to_le64(e->bytes); - if (e->bitmap) { - entry->type = BTRFS_FREE_SPACE_BITMAP; - list_add_tail(&e->list, &bitmap_list); - bitmaps++; - } else { - entry->type = BTRFS_FREE_SPACE_EXTENT; - } - node = rb_next(node); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; } + } - /* - * We want to add any pinned extents to our free space cache - * so we don't leak the space - */ - while (block_group && !next_page && - (start < block_group->key.objectid + - block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, - EXTENT_DIRTY); - if (ret) { - ret = 0; - break; - } - - /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + - block_group->key.offset) - break; - - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); - - entries++; - entry->offset = cpu_to_le64(start); - entry->bytes = cpu_to_le64(len); - entry->type = BTRFS_FREE_SPACE_EXTENT; - - start = end + 1; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); 
+ if (ret) { + ret = 0; + break; } - /* Generate bogus crc value */ - if (index == 0) { - u32 *tmp; - crc = btrfs_csum_data(root, orig + sizeof(u64), crc, - PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = orig; - *tmp = crc; - } + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; - kunmap(page); + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); - bytes += PAGE_CACHE_SIZE; + entries++; + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + if (ret) + goto out_nospc; - index++; - } while (node || next_page); + start = end + 1; + } /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) { - void *addr; struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - if (index >= num_pages) { - out_of_space = true; - break; - } - page = pages[index]; - - addr = kmap(page); - memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - + ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + if (ret) + goto out_nospc; list_del_init(&entry->list); - index++; - } - - if (out_of_space) { - btrfs_drop_pages(pages, num_pages); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, - GFP_NOFS); - goto out; } /* Zero out the rest of the pages just to make sure */ - while (index < num_pages) { - void *addr; + io_ctl_zero_remaining_pages(&io_ctl); - page = pages[index]; - addr = kmap(page); - memset(addr, 0, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - index++; - } - - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, - bytes, &cached_state); - btrfs_drop_pages(pages, num_pages); + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + io_ctl_drop_pages(&io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); @@ -817,7 +862,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); goto out; @@ -830,7 +875,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); @@ -848,13 +894,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, err = 0; out: - kfree(pages); + io_ctl_free(&io_ctl); if (err) { - invalidate_inode_pages2_range(inode->i_mapping, 0, index); + invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); return err; + +out_nospc: + list_for_each_safe(pos, n, &bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } + io_ctl_drop_pages(&io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + goto out; } int 
btrfs_write_out_cache(struct btrfs_root *root, -- cgit v1.2.3-58-ga151 From 549b4fdb8f3c0708bbc0ee12ff955cd206c0f60c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 16:33:53 -0400 Subject: Btrfs: check the return value of filemap_write_and_wait in the space cache We need to check the return value of filemap_write_and_wait in the space cache writeout code. Also don't set the inode's generation until we're sure nothing else is going to fail. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 35bfc13c9d42..abc924c9467c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -852,9 +852,10 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out; - BTRFS_I(inode)->generation = trans->transid; - filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -884,6 +885,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; } } + + BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); -- cgit v1.2.3-58-ga151 From 9a82ca659d8bfd99afc0e89bbde2202322df5755 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 16:35:28 -0400 Subject: Btrfs: take overflow into account in reserving space My overcommit stuff can be a little racy when we're filling up the disk with fs_mark and we overcommit into things that quickly get used up for data. So use num_bytes to see if we have enough available space, so we're less likely to overcommit ourselves out of the ability to make reservations. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f9711a82fc54..f95e55083bdb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3521,7 +3521,7 @@ again: avail >>= 1; spin_unlock(&root->fs_info->free_chunk_lock); - if (used + orig_bytes < space_info->total_bytes + avail) { + if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; } -- cgit v1.2.3-58-ga151 From 5b0e95bf607ddd59b39f52d3d55e6581c817b530 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Oct 2011 08:58:24 -0400 Subject: Btrfs: inline checksums into the disk free space cache Yeah yeah I know this is how we used to do it and then I changed it, but damnit I'm changing it back. The fact is that writing out checksums will modify metadata, which could cause us to dirty a block group we've already written out, so we have to truncate it and all of its checksums and re-write it, which will write new checksums, which could dirty a block group that has already been written, and you see where I'm going with this? This can cause unmount, or really anything that depends on a transaction committing, to take its sweet damned time to happen. So go back to the way it was, only this time we're specifically setting NODATACOW because we can't go through the COW pathway anyway and we're doing our own built-in cow'ing by truncating the free space cache. 
The other new thing is once we truncate the old cache and preallocate the new space, we don't need to do that song and dance at all for the rest of the transaction, we can just overwrite the existing space with the new cache if the block group changes for whatever reason, and the NODATACOW will let us do this fine. So keep track of which transaction we last cleared our cache in and if we cleared it in this transaction just say we're all setup and carry on. This survives xfstests and stress.sh. The inode cache will continue to use the normal csum infrastructure since it only gets written once and there will be no more modifications to the fs tree in a transaction commit. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 18 ++-- fs/btrfs/free-space-cache.c | 211 ++++++++++++++++++++++++++++++++------------ fs/btrfs/inode.c | 10 +-- 4 files changed, 172 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1eafccb162ee..ea60897a9171 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -838,6 +838,7 @@ struct btrfs_block_group_cache { u64 bytes_super; u64 flags; u64 sectorsize; + u64 cache_generation; unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f95e55083bdb..0abf70c984e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2717,6 +2717,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2756,19 +2763,16 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) { + if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); - } else { - btrfs_delalloc_release_space(inode, num_pages); - } + btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2776,6 +2780,8 @@ out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abc924c9467c..5d40c1ed8225 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) @@ -99,9 +100,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + if (!((BTRFS_I(inode)->flags & flags) == flags)) { printk(KERN_INFO "Old style space inode found, converting.\n"); - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } @@ -123,12 +125,17 @@ int 
__create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -141,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -249,6 +255,7 @@ struct io_ctl { unsigned long size; int index; int num_pages; + unsigned check_crcs:1; }; static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, @@ -262,6 +269,8 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, if (!io_ctl->pages) return -ENOMEM; io_ctl->root = root; + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + io_ctl->check_crcs = 1; return 0; } @@ -340,25 +349,39 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl_map_page(io_ctl, 1); /* - * Skip the first 64bits to make sure theres a bogus crc for old - * kernels + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. */ - io_ctl->cur += sizeof(u64); + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } val = io_ctl->cur; *val = cpu_to_le64(generation); io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; } static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) { u64 *gen; - io_ctl_map_page(io_ctl, 0); + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. 
+ */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } - /* Skip the bogus crc area */ - io_ctl->cur += sizeof(u64); gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { printk_ratelimited(KERN_ERR "btrfs: space cache generation " @@ -368,7 +391,63 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return -EIO; } io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages;; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + return 0; } @@ -391,22 +470,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; - /* - * index == 1 means the current page is 0, we need to generate a bogus - * crc for older kernels. - */ - if (io_ctl->index == 1) { - u32 *tmp; - u32 crc = ~(u32)0; - - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64), - crc, PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = io_ctl->orig; - *tmp = crc; - } - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); /* No more pages to map */ if (io_ctl->index >= io_ctl->num_pages) @@ -427,14 +491,14 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) * map the next one if there is any left. */ if (io_ctl->cur != io_ctl->orig) { - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index >= io_ctl->num_pages) return -ENOSPC; io_ctl_map_page(io_ctl, 0); } memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); return 0; @@ -442,51 +506,60 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) { - io_ctl_unmap_page(io_ctl); + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. 
+ */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); while (io_ctl->index < io_ctl->num_pages) { io_ctl_map_page(io_ctl, 1); - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); } } -static u8 io_ctl_read_entry(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; - u8 type; e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); entry->bytes = le64_to_cpu(e->bytes); - type = e->type; + *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) - return type; + return 0; io_ctl_unmap_page(io_ctl); if (io_ctl->index >= io_ctl->num_pages) - return type; + return 0; - io_ctl_map_page(io_ctl, 0); - return type; + return io_ctl_check_crc(io_ctl, io_ctl->index); } -static void io_ctl_read_bitmap(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) { - BUG_ON(!io_ctl->cur); - if (io_ctl->cur != io_ctl->orig) { + int ret; + + if (io_ctl->cur && io_ctl->cur != io_ctl->orig) io_ctl_unmap_page(io_ctl); - io_ctl_map_page(io_ctl, 0); - } + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); io_ctl_unmap_page(io_ctl); - if (io_ctl->index < io_ctl->num_pages) - io_ctl_map_page(io_ctl, 0); + + return 0; } int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, @@ -553,6 +626,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out; + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + ret = io_ctl_check_generation(&io_ctl, generation); if (ret) goto free_cache; @@ -563,7 +640,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!e) goto free_cache; - type = io_ctl_read_entry(&io_ctl, e); + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -611,7 +693,9 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, */ list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - io_ctl_read_bitmap(&io_ctl, e); + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; } io_ctl_drop_pages(&io_ctl); @@ -632,7 +716,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; - int ret; + int ret = 0; bool matched; u64 used = btrfs_block_group_used(&block_group->item); @@ -664,6 +748,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, return 0; } + /* We may have converted the inode and made the cache invalid. 
*/ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + goto out; + } + spin_unlock(&block_group->lock); + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, path, block_group->key.objectid); btrfs_free_path(path); @@ -774,6 +866,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { + WARN_ON(1); + goto out_nospc; + } + io_ctl_set_generation(&io_ctl, trans->transid); /* Write out the extent entries */ @@ -864,8 +963,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); goto out; } leaf = path->nodes[0]; @@ -878,9 +977,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); btrfs_release_path(path); goto out; } @@ -942,7 +1040,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15adfb542502..246397d8478f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1792,12 +1792,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } ret = 0; out: - btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (nolock) { - if (trans) + if (root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) { + if (nolock) btrfs_end_transaction_nolock(trans, root); - } else { - if (trans) + else btrfs_end_transaction(trans, root); } -- cgit v1.2.3-58-ga151 From 4b91c14f913f649d4302b3677b85c4ce87a3d8e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 7 Oct 2011 11:55:34 -0400 Subject: Btrfs: wait for ordered extents if we didn't reclaim enough I noticed recently that my overcommit patch was causing one of my enospc tests to fail 25% of the time with early ENOSPC. This is because my overcommit patch was letting us go way overboard, but it wasn't waiting long enough to let the delalloc shrinker do its job. The problem is we just start writeback and wait a little bit hoping we flush enough, but we only free up delalloc space by having the writes complete all the way. We do this by waiting for ordered extents, which we do but only if we already freed enough for the reservation. That isn't right: we should flush ordered extents if we didn't reclaim enough, in case that pushes us over the edge. With this patch I've not seen a failure in this enospc test after running it in a loop for an hour. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0abf70c984e9..fc0de6880045 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3406,7 +3406,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed >= to_reclaim && !trans) + if (reclaimed < to_reclaim && !trans) btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } -- cgit v1.2.3-58-ga151 From e70bea5fe0e3d6355fd95674eaff5aa5a32f0564 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 11 Oct 2011 14:18:24 -0400 Subject: Btrfs: fix the amount of space reserved for unlink Our unlink reservations were a bit much: we were reserving 10 items, while I only count 8 possible items we're touching. So comment what we're reserving for and fix the count value. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 246397d8478f..1f013c5c36aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2758,7 +2758,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); - trans = btrfs_start_transaction(root, 10); + /* + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode ref in the tree log + * 2 for the dir entries in the log + * 1 for the inode + */ + trans = btrfs_start_transaction(root, 8); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; -- cgit v1.2.3-58-ga151 From ed3ee9f44ba55eb6acfbfc8caa881e0253710d2a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Oct 2011 13:09:22 -0400 Subject: Btrfs: fix regression in re-setting a large xattr Recently I changed the xattr stuff to unconditionally set the xattr first in case the xattr didn't exist yet. This has introduced a regression when setting an xattr that already exists with a large value. If we find the key we are looking for, split_leaf will assume that we're extending that item. The problem is that the size we pass down to btrfs_search_slot already includes the size of the item, so if the largest possible xattr is already in place, split_leaf adds our requested size to it and concludes we'd overflow the leaf. Thankfully extending is not what we're actually doing, but split_leaf doesn't know that, so it just returns EOVERFLOW. So in the xattr code we need to check and see if we got back EOVERFLOW and treat it like EEXIST, since that's really what happened. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/xattr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 69565e5fc6a0..a76e41c04b71 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), name, name_len, value, size); + /* + * If we're setting an xattr to a new value but the new value is say + * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting + * back from split_leaf. This is because it thinks we'll be extending + * the existing item size, but we're asking for enough space to add the + * item itself. So if we get EOVERFLOW just set ret to EEXIST and let + * the rest of the function figure it out. 
+ */ + if (ret == -EOVERFLOW) + ret = -EEXIST; + if (ret == -EEXIST) { if (flags & XATTR_CREATE) goto out; -- cgit v1.2.3-58-ga151 From bbb495c2ed9d34894cb3d6d366866fc92a2e1adc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:37:45 -0400 Subject: Btrfs: don't check bytes_pinned to determine if we should commit the transaction Before, the only reason to commit the transaction to recover space in reserve_metadata_bytes() was if there were enough pinned_bytes to satisfy our reservation. But now we have the delayed inode stuff, which will hold its reservations until we commit the transaction. So say we max out our reservation by creating a bunch of files but don't have any pinned bytes: we will ENOSPC out early even though we could commit the transaction and get that space back. So now just unconditionally commit the transaction, since currently there is no way to know how much metadata space is being reserved by delayed inode stuff. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc0de6880045..79365a40cb3a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3568,17 +3568,6 @@ again: goto again; } - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. - */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - ret = -EAGAIN; if (trans) goto out; -- cgit v1.2.3-58-ga151 From f104d044376aadcee74605d66b8d9dc2e145782c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:56:58 -0400 Subject: Btrfs: wait for ordered extents if we're in trouble when shrinking delalloc The only way we actually reclaim delalloc space is waiting for the IO to completely finish. Usually we kick off a bunch of IO and wait for a little bit and hope we can make our reservation, and usually this works out pretty well. With overcommit however we can get seriously underwater if we're filling up the disk quickly, so we need to be able to force the delalloc shrinker to wait for the ordered IO to finish to give us a better chance of actually reclaiming enough space to get our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79365a40cb3a..96cbc5104959 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,7 +3334,8 @@ out: * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) + struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; @@ -3387,11 +3388,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. 
There is no sense in looping here for a long time @@ -3406,8 +3411,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed < to_reclaim && !trans) - btrfs_wait_ordered_extents(root, 0, 0); + return reclaimed >= to_reclaim; } @@ -3438,6 +3442,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root, int ret = 0; bool committed = false; bool flushing = false; + bool wait_ordered = false; trans = (struct btrfs_trans_handle *)current->journal_info; again: @@ -3496,6 +3501,7 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ + wait_ordered = true; num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } @@ -3530,6 +3536,8 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; + } else { + wait_ordered = true; } } @@ -3552,7 +3560,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); + ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3564,6 +3572,7 @@ again: * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } -- cgit v1.2.3-58-ga151 From 877da174301dde9062b915da4c8103048be49702 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:02:10 -0400 Subject: Btrfs: allow shrink_delalloc to flush the needed reclaimed pages Currently we only allow a maximum of 2 megabytes of pages to be flushed at a time. This was ok before, but now we have overcommit which will screw us in a heartbeat if we are quickly filling the disk. So instead pick either 2 megabytes or the number of pages we need to reclaim to be safe again, whichever is larger. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96cbc5104959..424ae82855c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3343,7 +3343,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; @@ -3366,7 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } max_reclaim = min(reserved, to_reclaim); - + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); -- cgit v1.2.3-58-ga151 From b24e03db0df3e9164c9649db12fecc8c2d81b0d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:40:17 -0400 Subject: Btrfs: release trans metadata bytes before flushing delayed refs We started setting trans->block_rsv = NULL to allow the delayed refs flushing stuff to use the right block_rsv and then just made btrfs_trans_release_metadata() unconditionally use the trans block rsv. The problem with this is we need to reserve some space in the transaction and then migrate it to the global block rsv, so we need to be able to free that up properly. So instead just move btrfs_trans_release_metadata() before the delayed ref flushing and use trans->block_rsv for the freeing. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +---- fs/btrfs/transaction.c | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 424ae82855c8..eb4fe56b08bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3909,13 +3909,10 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (!trans->bytes_reserved) return; - block_rsv = &root->fs_info->trans_block_rsv; - btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1e1a4816ccb0..d064fa0a4a07 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -462,6 +462,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; @@ -483,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } - btrfs_trans_release_metadata(trans, root); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1128,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; /* make a pass through all the delayed refs we have so far * and try to process them. */ ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - btrfs_trans_release_metadata(trans, root); - cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to -- cgit v1.2.3-58-ga151 From 3880a1b46d87a6b030c31889875befc745d95dff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:46:51 -0400 Subject: Btrfs: reserve some space for an orphan item when unlinking In __unlink_start_trans() if we don't have enough room for a reservation we will check to see if the unlink will free up space. If it does, that's great, but we could still add an orphan item, so we need to reserve enough space to add the orphan item. Do this and migrate the space to the global reserve so it all works out right. 
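As a rough illustration of the reserve-then-migrate idea, here is a minimal userspace accounting sketch; struct rsv, rsv_migrate() and the sizes are invented for the example and are not the btrfs API:

    #include <stdio.h>

    /* Toy model of two metadata reserves: the short-lived transaction
     * reserve and the long-lived global reserve that ends up backing
     * the orphan item. */
    struct rsv {
            unsigned long reserved;
    };

    /* Move "bytes" worth of reservation from src to dst, failing if
     * src doesn't actually hold that much. */
    static int rsv_migrate(struct rsv *src, struct rsv *dst,
                           unsigned long bytes)
    {
            if (src->reserved < bytes)
                    return -1;
            src->reserved -= bytes;
            dst->reserved += bytes;
            return 0;
    }

    int main(void)
    {
            struct rsv trans_rsv = { 4096 };   /* 1 item's worth, made up */
            struct rsv global_rsv = { 0 };

            if (rsv_migrate(&trans_rsv, &global_rsv, 4096) == 0)
                    printf("global reserve: %lu bytes\n", global_rsv.reserved);
            return 0;
    }

The all-or-nothing failure mode mirrors what the patch relies on from btrfs_block_rsv_migrate(): either the whole reservation moves to the long-lived pool, or the caller finds out and bails.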
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f013c5c36aa..b6b70bdd0992 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2790,7 +2790,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, return ERR_PTR(-ENOMEM); } - trans = btrfs_start_transaction(root, 0); + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); root->fs_info->enospc_unlink = 0; @@ -2895,6 +2896,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = 0; out: btrfs_free_path(path); + /* Migrate the orphan reservation over */ + if (!err) + err = btrfs_block_rsv_migrate(trans->block_rsv, + &root->fs_info->global_block_rsv, + btrfs_calc_trans_metadata_size(root, 1)); + if (err) { btrfs_end_transaction(trans, root); root->fs_info->enospc_unlink = 0; -- cgit v1.2.3-58-ga151 From 36ba022ac0b748dd543f43430b03198e899426c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 12:15:48 -0400 Subject: Btrfs: separate out btrfs_block_rsv_check into 2 different functions Currently btrfs_block_rsv_check does 2 things: it will either refill a block reserve like in the truncate or refill case, or it will check to see if there is enough space in the global reserve and possibly refill it. However because of overcommit we could be well overcommitting ourselves just to try and refill the global reserve, when really we should just be committing the transaction. So break this out into btrfs_block_rsv_refill and btrfs_block_rsv_check. Refill will try to reserve more metadata if it can; btrfs_block_rsv_check will not, it will only tell you whether the given factor of the total space is still reserved. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 +++- fs/btrfs/extent-tree.c | 41 +++++++++++++++++++++++++++-------------- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 6 ++---- fs/btrfs/transaction.c | 4 ++-- 6 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea60897a9171..227620993bce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2252,8 +2252,10 @@ int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush); + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eb4fe56b08bb..a5f1421eeee9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3422,7 +3422,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - wether or not we can flush to make our reservation - * @check - wether this is just to check if we have enough space or not * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. 
If there is not enough space it will make an attempt to @@ -3433,7 +3432,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush, int check) + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; @@ -3507,7 +3506,7 @@ again: (orig_bytes * (retries + 1)); } - if (ret && !check) { + if (ret) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3742,7 +3741,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3752,8 +3751,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, } int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush) + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3762,11 +3760,26 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; if (block_rsv->reserved >= num_bytes) ret = 0; else @@ -3776,7 +3789,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4073,7 +4086,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5728,7 +5741,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -5749,7 +5762,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d40c1ed8225..2fecfc3183ee 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -203,7 +203,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6b70bdd0992..f12747c9447b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3604,7 +3604,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_refill(root, rsv, min_size); /* * Try and steal from the global reserve since we will @@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_refill(root, rsv, min_size); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7fa090fa0d39..10af6a0e0865 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved, - 0, 0); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -3774,8 +3773,7 @@ restart: } } - ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0, - 5, 0); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d064fa0a4a07..29bef63e23ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0, - 5, 0); + + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); return ret ? 1 : 0; } -- cgit v1.2.3-58-ga151 From 7e355b83efa80e5f5821591c13c17649594d82ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 13:07:31 -0400 Subject: Btrfs: if we have a lot of pinned space, commit the transaction Mitch kept hitting a panic because he was getting ENOSPC. One of my previous patches makes it so we are much better at not allocating new metadata chunks. Unfortunately coupled with the overcommit patch this works us into a bit of a problem if we are removing a bunch of space and end up chewing up all of our space with pinned extents. We can allocate chunks fine and overflow is ok, but the only way to reclaim this space is to commit the transaction. So if we go to overcommit, first check and see how much pinned space we have. 
If we have more than 80% of the free space chewed up with pinned extents, just commit the transaction, this will free up enough space for our reservation and we won't have this problem anymore. With this patch Mitch's test doesn't blow up anymore. Thanks, Reported-and-tested-by: Mitch Harder Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5f1421eeee9..4eb7d2ba38f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3510,6 +3510,20 @@ again: u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !trans && + !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + goto commit; + } + spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3581,6 +3595,7 @@ again: if (trans) goto out; +commit: ret = -ENOSPC; if (committed) goto out; -- cgit v1.2.3-58-ga151 From 016fc6a63e465d5b94e4028f6d05d9703e195428 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2011 10:31:37 -0400 Subject: Btrfs: don't flush the cache inode before writing it I noticed we had a little bit of latency when writing out the space cache inodes. It's because we flush it before we write anything in case we have dirty pages already there. This doesn't matter though since we're just going to overwrite the space, and there really shouldn't be any dirty pages anyway. This makes some of my tests run a little bit faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2fecfc3183ee..de205d59b74b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -829,10 +829,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - filemap_write_and_wait(inode->i_mapping); - btrfs_wait_ordered_range(inode, inode->i_size & - ~(root->sectorsize - 1), (u64)-1); - io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ -- cgit v1.2.3-58-ga151 From 42c4dfc213190fafffc53815c2ee6064430bc379 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:28:17 -0400 Subject: cifs: turn read_from_socket into a wrapper around a vectorized version Eventually we'll want to allow cifsd to read data directly into the pagecache. In order to do that we'll need a routine that can take a kvec array and pass that directly to kernel_recvmsg. Unfortunately though, the kernel's recvmsg routines modify the kvec array that gets passed in, so we need to use a copy of the kvec array and refresh that copy on each pass through the loop. 
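To make the clone-and-advance idea concrete, here is a minimal userspace sketch with struct iovec standing in for the kernel's kvec; iov_array_init() is an illustrative analog of the kvec_array_init() helper added below, and it assumes the advance is smaller than the array's total length:

    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Clone iov into new_iov and advance "bytes" into the clone,
     * skipping zero-length segments; returns the number of segments
     * remaining. The original array is left untouched so it can be
     * re-cloned on the next pass through the receive loop. */
    static unsigned int
    iov_array_init(struct iovec *new_iov, const struct iovec *iov,
                   unsigned int nr_segs, size_t bytes)
    {
            size_t base = 0;

            while (bytes || !iov->iov_len) {
                    size_t copy = bytes < iov->iov_len ? bytes : iov->iov_len;

                    bytes -= copy;
                    base += copy;
                    if (iov->iov_len == base) {
                            iov++;
                            nr_segs--;
                            base = 0;
                    }
            }
            memcpy(new_iov, iov, sizeof(*iov) * nr_segs);
            new_iov->iov_base = (char *)new_iov->iov_base + base;
            new_iov->iov_len -= base;
            return nr_segs;
    }

    int main(void)
    {
            char a[4], b[4];
            struct iovec orig[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            struct iovec cur[2];

            /* pretend 6 of the 8 bytes have already been received */
            unsigned int segs = iov_array_init(cur, orig, 2, 6);

            printf("segs=%u first_len=%zu\n", segs, cur[0].iov_len);
            return 0;
    }

Each partial receive leaves the original array pristine; the next pass through the loop rebuilds the clone from it, advanced past whatever has already arrived.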
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 97a65af2a08a..4860940b748b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -375,14 +375,54 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +/* + * kvec_array_init - clone a kvec array, and advance into it + * @new: pointer to memory for cloned array + * @iov: pointer to original array + * @nr_segs: number of members in original array + * @bytes: number of bytes to advance into the cloned array + * + * This function will copy the array provided in iov to a section of memory + * and advance the specified number of bytes into the new array. It returns + * the number of segments in the new array. "new" must be at least as big as + * the original iov array. + */ +static unsigned int +kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, + size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} + static int -read_from_socket(struct TCP_Server_Info *server, char *buf, - unsigned int to_read) +readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) { int length = 0; int total_read; + unsigned int segs; struct msghdr smb_msg; - struct kvec iov; + struct kvec *iov; + + iov = kmalloc(sizeof(*iov_orig) * nr_segs, GFP_NOFS); + if (!iov) + return -ENOMEM; smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; @@ -393,10 +433,11 @@ read_from_socket(struct TCP_Server_Info *server, char *buf, break; } - iov.iov_base = buf + total_read; - iov.iov_len = to_read; - length = kernel_recvmsg(server->ssocket, &smb_msg, &iov, 1, - to_read, 0); + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(server->ssocket, &smb_msg, + iov, segs, to_read, 0); + if (server->tcpStatus == CifsExiting) { total_read = -ESHUTDOWN; break; @@ -423,9 +464,22 @@ read_from_socket(struct TCP_Server_Info *server, char *buf, break; } } + kfree(iov); return total_read; } +static int +read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) +{ + struct kvec iov; + + iov.iov_base = buf; + iov.iov_len = to_read; + + return readv_from_socket(server, &iov, 1, to_read); +} + static bool is_smb_response(struct TCP_Server_Info *server, unsigned char type) { -- cgit v1.2.3-58-ga151 From 1041e3f9919999b22c9c2a453aa0d92cd16b76ee Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:28:27 -0400 Subject: cifs: keep a reusable kvec array for receives Having to continually allocate a new kvec array is expensive. Allocate one that's big enough, and only reallocate it as needed. 
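The caching itself is the classic grow-only scratch buffer pattern; a simplified userspace sketch of the same strategy (struct iov_cache and iov_cache_get() are invented names, not the cifs code):

    #include <stdlib.h>
    #include <sys/uio.h>

    struct iov_cache {
            struct iovec *iov;
            unsigned int nr_iov;
    };

    /* Return a scratch array with room for at least nr_segs entries,
     * growing (never shrinking) the cached allocation as needed. On
     * allocation failure the old array is kept. */
    static struct iovec *
    iov_cache_get(struct iov_cache *c, unsigned int nr_segs)
    {
            struct iovec *new_iov;

            if (c->iov && nr_segs <= c->nr_iov)
                    return c->iov;

            new_iov = malloc(sizeof(*new_iov) * nr_segs);
            if (new_iov) {
                    free(c->iov);
                    c->iov = new_iov;
                    c->nr_iov = nr_segs;
            }
            return new_iov;
    }

    int main(void)
    {
            struct iov_cache c = { 0 };

            return iov_cache_get(&c, 8) ? 0 : 1;
    }

Note that on allocation failure the old, smaller array survives, so a later request that fits within it can still succeed.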
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 55ebf39fb3fd..51ed2de23070 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -292,6 +292,8 @@ struct TCP_Server_Info { bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ struct delayed_work echo; /* echo ping workqueue job */ + struct kvec *iov; /* reusable kvec array for receives */ + unsigned int nr_iov; /* number of kvecs in array */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4860940b748b..ee70075c5fb1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -410,6 +410,24 @@ kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, return nr_segs; } +static struct kvec * +get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) +{ + struct kvec *new_iov; + + if (server->iov && nr_segs <= server->nr_iov) + return server->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); + if (new_iov) { + kfree(server->iov); + server->iov = new_iov; + server->nr_iov = nr_segs; + } + return new_iov; +} + static int readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, unsigned int nr_segs, unsigned int to_read) @@ -420,7 +438,7 @@ readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, struct msghdr smb_msg; struct kvec *iov; - iov = kmalloc(sizeof(*iov_orig) * nr_segs, GFP_NOFS); + iov = get_server_iovec(server, nr_segs); if (!iov) return -ENOMEM; @@ -464,7 +482,6 @@ readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, break; } } - kfree(iov); return total_read; } @@ -669,6 +686,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } kfree(server->hostname); + kfree(server->iov); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); -- cgit v1.2.3-58-ga151 From 89482a56a079f01c2f4c709f8e23fbf7eeda1b43 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:28:57 -0400 Subject: cifs: add a third receive phase to cifs_demultiplex_thread Have the demultiplex thread receive just enough to get to the MID, and then find it before receiving the rest. Later, we'll use this to swap in a preallocated receive buffer for some calls. 
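The shape this gives the receive path is easiest to see for a generic length-prefixed protocol; in this hedged userspace sketch, recv_full(), the 32-byte header size and the ID offset are all made up, and on-the-wire byte order is ignored for brevity:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    /* Loop until exactly len bytes have arrived; 0 on success. */
    static int recv_full(int fd, void *buf, size_t len)
    {
            char *p = buf;

            while (len) {
                    ssize_t n = read(fd, p, len);

                    if (n <= 0)
                            return -1;
                    p += n;
                    len -= (size_t)n;
            }
            return 0;
    }

    enum { HDR_SIZE = 32, ID_OFFSET = 8 };  /* made-up framing */

    int receive_frame(int fd, char *buf, size_t bufsize, uint16_t *id)
    {
            uint32_t len;

            /* phase 1: the length prefix */
            if (recv_full(fd, &len, sizeof(len)))
                    return -1;
            if (len < HDR_SIZE || len > bufsize)
                    return -1;

            /* phase 2: just enough of the header to find the ID */
            if (recv_full(fd, buf, HDR_SIZE))
                    return -1;
            memcpy(id, buf + ID_OFFSET, sizeof(*id));

            /* ...an ID lookup could pick a special buffer here... */

            /* phase 3: the remainder of the frame */
            return recv_full(fd, buf + HDR_SIZE, len - HDR_SIZE);
    }

The point of the middle phase is that by the time the bulk of the frame arrives, the reader already knows which request it belongs to and can choose where to put it.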
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ee70075c5fb1..5308bc6e1248 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -746,11 +746,25 @@ cifs_demultiplex_thread(void *p) if (!is_smb_response(server, buf[0])) continue; - /* check the length */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - 4, pdu_length + 4); + /* make sure we have enough to get to the MID */ + if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { + cERROR(1, "SMB response too short (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; + } + + /* read down to the MID */ + length = read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); + if (length < 0) + continue; + total_read += length; + + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); cifs_reconnect(server); wake_up(&server->response_q); continue; @@ -759,12 +773,15 @@ cifs_demultiplex_thread(void *p) /* else length ok */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { isLargeBuf = true; - memcpy(bigbuf, smallbuf, 4); + memcpy(bigbuf, smallbuf, total_read); smb_buffer = (struct smb_hdr *)bigbuf; buf = bigbuf; } - length = read_from_socket(server, buf + 4, pdu_length); + /* now read the rest */ + length = read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); if (length < 0) continue; total_read += length; -- cgit v1.2.3-58-ga151 From ea1f4502fc939b64807f9ab0eca259321047fe83 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:05 -0400 Subject: cifs: move mid finding into separate routine Begin breaking up find_cifs_mid into smaller pieces. The parts that coalesce T2 responses don't really need to be done under the GlobalMid_lock anyway. Create a new function that just finds the mid on the list, and then later takes it off the list if the entire response has been received. 
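The split is easiest to see as two short critical sections over one pending list; a simplified pthread sketch (struct entry and the list are stand-ins for the cifs mid queue, not its real types):

    #include <pthread.h>
    #include <stddef.h>

    struct entry {
            struct entry *next;
            unsigned int mid;
            int state;
    };

    static pthread_mutex_t mid_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry *pending;

    /* Lookup only: hold the lock just long enough to find the entry. */
    struct entry *find_mid_entry(unsigned int mid)
    {
            struct entry *e;

            pthread_mutex_lock(&mid_lock);
            for (e = pending; e; e = e->next) {
                    if (e->mid == mid)
                            break;
            }
            pthread_mutex_unlock(&mid_lock);
            return e;
    }

    /* Removal is a separate, equally short critical section, entered
     * only once the whole response is in hand. */
    void dequeue_mid_entry(struct entry *mid, int state)
    {
            struct entry **pp;

            pthread_mutex_lock(&mid_lock);
            mid->state = state;
            for (pp = &pending; *pp; pp = &(*pp)->next) {
                    if (*pp == mid) {
                            *pp = mid->next;
                            break;
                    }
            }
            pthread_mutex_unlock(&mid_lock);
    }

Everything between the two critical sections, such as coalescing a multi-part response, can then run without the lock held.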
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 113 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5308bc6e1248..0f69b311d3fc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -542,61 +542,80 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) } static struct mid_q_entry * -find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) +find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) { - struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; + struct mid_q_entry *mid; spin_lock(&GlobalMid_Lock); - list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { - if (mid->mid != buf->Mid || - mid->midState != MID_REQUEST_SUBMITTED || - mid->command != buf->Command) - continue; - - if (*length == 0 && check2ndT2(buf) > 0) { - /* We have a multipart transact2 resp */ - *is_multi_rsp = true; - if (mid->resp_buf) { - /* merge response - fix up 1st*/ - *length = coalesce_t2(buf, mid->resp_buf); - if (*length > 0) { - *length = 0; - mid->multiRsp = true; - break; - } - /* All parts received or packet is malformed. */ - mid->multiEnd = true; - goto multi_t2_fnd; - } - if (!is_large_buf) { - /*FIXME: switch to already allocated largebuf?*/ - cERROR(1, "1st trans2 resp needs bigbuf"); - } else { - /* Have first buffer */ - mid->resp_buf = buf; - mid->largeBuf = true; - *bigbuf = NULL; - } - break; + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if (mid->mid == buf->Mid && + mid->midState == MID_REQUEST_SUBMITTED && + mid->command == buf->Command) { + spin_unlock(&GlobalMid_Lock); + return mid; } - mid->resp_buf = buf; - mid->largeBuf = is_large_buf; -multi_t2_fnd: - if (*length == 0) - mid->midState = MID_RESPONSE_RECEIVED; - else - mid->midState = MID_RESPONSE_MALFORMED; + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} + +static void +dequeue_mid(struct mid_q_entry *mid, int malformed) +{ #ifdef CONFIG_CIFS_STATS2 - mid->when_received = jiffies; + mid->when_received = jiffies; #endif - list_del_init(&mid->qhead); - ret = mid; - break; - } + spin_lock(&GlobalMid_Lock); + if (!malformed) + mid->midState = MID_RESPONSE_RECEIVED; + else + mid->midState = MID_RESPONSE_MALFORMED; + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); +} - return ret; +static struct mid_q_entry * +find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, + int *malformed, bool is_large_buf, bool *is_multi_rsp, + char **bigbuf) +{ + struct mid_q_entry *mid = NULL; + + mid = find_mid(server, buf); + if (!mid) + return mid; + + if (*malformed == 0 && check2ndT2(buf) > 0) { + /* We have a multipart transact2 resp */ + *is_multi_rsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + *malformed = coalesce_t2(buf, mid->resp_buf); + if (*malformed > 0) { + *malformed = 0; + mid->multiRsp = true; + return NULL; + } + /* All parts received or packet is malformed. 
*/ + mid->multiEnd = true; + goto multi_t2_fnd; + } + if (!is_large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cERROR(1, "1st trans2 resp needs bigbuf"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->largeBuf = true; + *bigbuf = NULL; + } + return mid; + } + mid->resp_buf = buf; + mid->largeBuf = is_large_buf; +multi_t2_fnd: + dequeue_mid(mid, *malformed); + return mid; } static void clean_demultiplex_info(struct TCP_Server_Info *server) -- cgit v1.2.3-58-ga151 From ffc00e27aa5d343eb71068c185cdbd65871ccdce Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:13 -0400 Subject: cifs: eliminate is_multi_rsp parm to find_cifs_mid Change find_cifs_mid to only return NULL if a mid could not be found. If we got part of a multi-part T2 response, then coalesce it and still return the mid. The caller can determine the T2 receive status from the flags in the mid. With this change, there is no need to pass a pointer to "length" as well so just pass by value. If a mid is found, then we can just mark it as malformed. If one isn't found, then the value of "length" won't change anyway. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0f69b311d3fc..8d5a61561909 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -576,8 +576,7 @@ dequeue_mid(struct mid_q_entry *mid, int malformed) static struct mid_q_entry * find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int *malformed, bool is_large_buf, bool *is_multi_rsp, - char **bigbuf) + int malformed, bool is_large_buf, char **bigbuf) { struct mid_q_entry *mid = NULL; @@ -585,17 +584,14 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, if (!mid) return mid; - if (*malformed == 0 && check2ndT2(buf) > 0) { - /* We have a multipart transact2 resp */ - *is_multi_rsp = true; + if (malformed == 0 && check2ndT2(buf) > 0) { + mid->multiRsp = true; if (mid->resp_buf) { /* merge response - fix up 1st*/ - *malformed = coalesce_t2(buf, mid->resp_buf); - if (*malformed > 0) { - *malformed = 0; - mid->multiRsp = true; - return NULL; - } + malformed = coalesce_t2(buf, mid->resp_buf); + if (malformed > 0) + return mid; + /* All parts received or packet is malformed. 
*/ mid->multiEnd = true; goto multi_t2_fnd; @@ -614,7 +610,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->resp_buf = buf; mid->largeBuf = is_large_buf; multi_t2_fnd: - dequeue_mid(mid, *malformed); + dequeue_mid(mid, malformed); return mid; } @@ -725,7 +721,6 @@ cifs_demultiplex_thread(void *p) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; bool isLargeBuf = false; - bool isMultiRsp = false; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -745,7 +740,6 @@ cifs_demultiplex_thread(void *p) continue; isLargeBuf = false; - isMultiRsp = false; smb_buffer = (struct smb_hdr *)smallbuf; buf = smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ @@ -823,23 +817,25 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; - mid_entry = find_cifs_mid(server, smb_buffer, &length, - isLargeBuf, &isMultiRsp, &bigbuf); + mid_entry = find_cifs_mid(server, smb_buffer, length, + isLargeBuf, &bigbuf); if (mid_entry != NULL) { - mid_entry->callback(mid_entry); + if (mid_entry->multiRsp && !mid_entry->multiEnd) + continue; + /* Was previous buf put in mpx struct for multi-rsp? */ - if (!isMultiRsp) { + if (!mid_entry->multiRsp) { /* smb buffer will be freed by user thread */ if (isLargeBuf) bigbuf = NULL; else smallbuf = NULL; } + mid_entry->callback(mid_entry); } else if (length != 0) { /* response sanity checks failed */ continue; - } else if (!is_valid_oplock_break(smb_buffer, server) && - !isMultiRsp) { + } else if (!is_valid_oplock_break(smb_buffer, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, -- cgit v1.2.3-58-ga151 From 2a37ef94bb153fad13cbb091aab679d7c8b9a67f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:23 -0400 Subject: cifs: move buffer pointers into TCP_Server_Info We have several functions that need to access these pointers. Currently that's done with a lot of double pointer passing. Instead, move them into the TCP_Server_Info and simplify the handling. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsglob.h | 4 +++ fs/cifs/connect.c | 101 ++++++++++++++++++++++++----------------------------- 2 files changed, 50 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 51ed2de23070..a73dd1d5e7ef 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -291,9 +291,13 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool large_buf; /* is current buffer large? 
*/ struct delayed_work echo; /* echo ping workqueue job */ struct kvec *iov; /* reusable kvec array for receives */ unsigned int nr_iov; /* number of kvecs in array */ + char *smallbuf; /* pointer to current "small" buffer */ + char *bigbuf; /* pointer to current "big" buffer */ + unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8d5a61561909..8918f935bf07 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -320,27 +320,24 @@ requeue_echo: } static bool -allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, - bool is_large_buf) +allocate_buffers(struct TCP_Server_Info *server) { - char *bbuf = *bigbuf, *sbuf = *smallbuf; - - if (bbuf == NULL) { - bbuf = (char *)cifs_buf_get(); - if (!bbuf) { + if (!server->bigbuf) { + server->bigbuf = (char *)cifs_buf_get(); + if (!server->bigbuf) { cERROR(1, "No memory for large SMB response"); msleep(3000); /* retry will check if exiting */ return false; } - } else if (is_large_buf) { + } else if (server->large_buf) { /* we are reusing a dirty large buf, clear its start */ - memset(bbuf, 0, size); + memset(server->bigbuf, 0, sizeof(struct smb_hdr)); } - if (sbuf == NULL) { - sbuf = (char *)cifs_small_buf_get(); - if (!sbuf) { + if (!server->smallbuf) { + server->smallbuf = (char *)cifs_small_buf_get(); + if (!server->smallbuf) { cERROR(1, "No memory for SMB response"); msleep(1000); /* retry will check if exiting */ @@ -349,12 +346,9 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, /* beginning of smb buffer is cleared in our buf_get */ } else { /* if existing small buf clear beginning */ - memset(sbuf, 0, size); + memset(server->smallbuf, 0, sizeof(struct smb_hdr)); } - *bigbuf = bbuf; - *smallbuf = sbuf; - return true; } @@ -576,7 +570,7 @@ dequeue_mid(struct mid_q_entry *mid, int malformed) static struct mid_q_entry * find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int malformed, bool is_large_buf, char **bigbuf) + int malformed) { struct mid_q_entry *mid = NULL; @@ -596,19 +590,27 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->multiEnd = true; goto multi_t2_fnd; } - if (!is_large_buf) { + if (!server->large_buf) { /*FIXME: switch to already allocated largebuf?*/ cERROR(1, "1st trans2 resp needs bigbuf"); } else { /* Have first buffer */ mid->resp_buf = buf; mid->largeBuf = true; - *bigbuf = NULL; + server->bigbuf = NULL; } return mid; } mid->resp_buf = buf; - mid->largeBuf = is_large_buf; + mid->largeBuf = server->large_buf; + /* Was previous buf put in mpx struct for multi-rsp? 
*/ + if (!mid->multiRsp) { + /* smb buffer will be freed by user thread */ + if (server->large_buf) + server->bigbuf = NULL; + else + server->smallbuf = NULL; + } multi_t2_fnd: dequeue_mid(mid, malformed); return mid; @@ -715,12 +717,11 @@ cifs_demultiplex_thread(void *p) { int length; struct TCP_Server_Info *server = p; - unsigned int pdu_length, total_read; - char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; + unsigned int pdu_length; + char *buf = NULL; struct smb_hdr *smb_buffer = NULL; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; - bool isLargeBuf = false; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -735,19 +736,18 @@ cifs_demultiplex_thread(void *p) if (try_to_freeze()) continue; - if (!allocate_buffers(&bigbuf, &smallbuf, - sizeof(struct smb_hdr), isLargeBuf)) + if (!allocate_buffers(server)) continue; - isLargeBuf = false; - smb_buffer = (struct smb_hdr *)smallbuf; - buf = smallbuf; + server->large_buf = false; + smb_buffer = (struct smb_hdr *)server->smallbuf; + buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ length = read_from_socket(server, buf, pdu_length); if (length < 0) continue; - total_read = length; + server->total_read = length; /* * The right amount was read from socket - 4 bytes, @@ -773,7 +773,7 @@ cifs_demultiplex_thread(void *p) sizeof(struct smb_hdr) - 1 - 4); if (length < 0) continue; - total_read += length; + server->total_read += length; if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cERROR(1, "SMB response too long (%u bytes)", @@ -785,10 +785,11 @@ cifs_demultiplex_thread(void *p) /* else length ok */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - isLargeBuf = true; - memcpy(bigbuf, smallbuf, total_read); - smb_buffer = (struct smb_hdr *)bigbuf; - buf = bigbuf; + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, + server->total_read); + smb_buffer = (struct smb_hdr *)server->bigbuf; + buf = server->bigbuf; } /* now read the rest */ @@ -797,9 +798,9 @@ cifs_demultiplex_thread(void *p) pdu_length - sizeof(struct smb_hdr) + 1 + 4); if (length < 0) continue; - total_read += length; + server->total_read += length; - dump_smb(smb_buffer, total_read); + dump_smb(smb_buffer, server->total_read); /* * We know that we received enough to get to the MID as we @@ -810,28 +811,18 @@ cifs_demultiplex_thread(void *p) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); + length = checkSMB(smb_buffer, smb_buffer->Mid, + server->total_read); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, - min_t(unsigned int, total_read, 48)); + min_t(unsigned int, server->total_read, 48)); server->lstrp = jiffies; - mid_entry = find_cifs_mid(server, smb_buffer, length, - isLargeBuf, &bigbuf); + mid_entry = find_cifs_mid(server, smb_buffer, length); if (mid_entry != NULL) { - if (mid_entry->multiRsp && !mid_entry->multiEnd) - continue; - - /* Was previous buf put in mpx struct for multi-rsp? 
*/ - if (!mid_entry->multiRsp) { - /* smb buffer will be freed by user thread */ - if (isLargeBuf) - bigbuf = NULL; - else - smallbuf = NULL; - } - mid_entry->callback(mid_entry); + if (!mid_entry->multiRsp || mid_entry->multiEnd) + mid_entry->callback(mid_entry); } else if (length != 0) { /* response sanity checks failed */ continue; @@ -849,9 +840,9 @@ cifs_demultiplex_thread(void *p) } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ - cifs_buf_release(bigbuf); - if (smallbuf) /* no sense logging a debug message if NULL */ - cifs_small_buf_release(smallbuf); + cifs_buf_release(server->bigbuf); + if (server->smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(server->smallbuf); task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); -- cgit v1.2.3-58-ga151 From c8054ebdb6903208b83aa59c387b16d5129492d5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:31 -0400 Subject: cifs: find mid earlier in receive codepath In order to receive directly into a preallocated buffer, we need to ID the mid earlier, before the bulk of the response is read. Call the mid finding routine as soon as we're able to read the mid. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8918f935bf07..52195bad5e67 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -568,27 +568,21 @@ dequeue_mid(struct mid_q_entry *mid, int malformed) spin_unlock(&GlobalMid_Lock); } -static struct mid_q_entry * -find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int malformed) +static void +handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, + struct smb_hdr *buf, int malformed) { - struct mid_q_entry *mid = NULL; - - mid = find_mid(server, buf); - if (!mid) - return mid; - if (malformed == 0 && check2ndT2(buf) > 0) { mid->multiRsp = true; if (mid->resp_buf) { /* merge response - fix up 1st*/ malformed = coalesce_t2(buf, mid->resp_buf); if (malformed > 0) - return mid; + return; /* All parts received or packet is malformed. 
*/ mid->multiEnd = true; - goto multi_t2_fnd; + return dequeue_mid(mid, malformed); } if (!server->large_buf) { /*FIXME: switch to already allocated largebuf?*/ @@ -599,7 +593,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, mid->largeBuf = true; server->bigbuf = NULL; } - return mid; + return; } mid->resp_buf = buf; mid->largeBuf = server->large_buf; @@ -611,9 +605,7 @@ find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, else server->smallbuf = NULL; } -multi_t2_fnd: dequeue_mid(mid, malformed); - return mid; } static void clean_demultiplex_info(struct TCP_Server_Info *server) @@ -775,6 +767,8 @@ cifs_demultiplex_thread(void *p) continue; server->total_read += length; + mid_entry = find_mid(server, smb_buffer); + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cERROR(1, "SMB response too long (%u bytes)", pdu_length); @@ -819,8 +813,8 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; - mid_entry = find_cifs_mid(server, smb_buffer, length); if (mid_entry != NULL) { + handle_mid(mid_entry, server, smb_buffer, length); if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); } else if (length != 0) { -- cgit v1.2.3-58-ga151 From e9097ab48978c89b9c0926e2ae5d49bf6ea91b18 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:40 -0400 Subject: cifs: break out 3rd receive phase into separate function Move the entire 3rd phase of the receive codepath into a separate function in preparation for the addition of a pluggable receive function. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 101 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 52195bad5e67..f05dedda37c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -704,6 +704,61 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) GFP_KERNEL); } +static int +standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + char *buf = server->smallbuf; + struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; + unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + /* make sure this will fit in a large buffer */ + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -EAGAIN; + } + + /* switch to large buffer if too big for a small one */ + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, server->total_read); + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } + + /* now read the rest */ + length = read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + dump_smb(smb_buffer, server->total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. 
+ */ + length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, server->total_read, 48)); + + if (mid) + handle_mid(mid, server, smb_buffer, length); + + return length; +} + static int cifs_demultiplex_thread(void *p) { @@ -769,57 +824,19 @@ cifs_demultiplex_thread(void *p) mid_entry = find_mid(server, smb_buffer); - if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "SMB response too long (%u bytes)", - pdu_length); - cifs_reconnect(server); - wake_up(&server->response_q); + length = standard_receive3(server, mid_entry); + if (length < 0) continue; - } - /* else length ok */ - if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - server->large_buf = true; - memcpy(server->bigbuf, server->smallbuf, - server->total_read); - smb_buffer = (struct smb_hdr *)server->bigbuf; + if (server->large_buf) { buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; } - /* now read the rest */ - length = read_from_socket(server, - buf + sizeof(struct smb_hdr) - 1, - pdu_length - sizeof(struct smb_hdr) + 1 + 4); - if (length < 0) - continue; - server->total_read += length; - - dump_smb(smb_buffer, server->total_read); - - /* - * We know that we received enough to get to the MID as we - * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. - * - * 48 bytes is enough to display the header and a little bit - * into the payload for debugging purposes. - */ - length = checkSMB(smb_buffer, smb_buffer->Mid, - server->total_read); - if (length != 0) - cifs_dump_mem("Bad SMB: ", buf, - min_t(unsigned int, server->total_read, 48)); - server->lstrp = jiffies; - if (mid_entry != NULL) { - handle_mid(mid_entry, server, smb_buffer, length); if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (length != 0) { - /* response sanity checks failed */ - continue; } else if (!is_valid_oplock_break(smb_buffer, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); -- cgit v1.2.3-58-ga151 From 44d22d846fdc7c3e688fc1ff5ae6d06d08bb5656 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:49 -0400 Subject: cifs: add a callback function to receive the rest of the frame In order to handle larger SMBs for readpages and other calls, we want to be able to read into a preallocated set of buffers. Rather than changing all of the existing code to preallocate buffers however, we instead add a receive callback function to the MID. cifsd will call this function once the mid_q_entry has been identified in order to receive the rest of the SMB. If the mid can't be identified or the receive pointer is unset, then the standard 3rd phase receive function will be called. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsglob.h | 23 ++++++++++++++++++++--- fs/cifs/cifsproto.h | 5 +++-- fs/cifs/cifssmb.c | 5 +++-- fs/cifs/connect.c | 6 +++++- fs/cifs/transport.c | 5 +++-- 5 files changed, 34 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a73dd1d5e7ef..d153d0b89d39 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -656,8 +656,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, struct mid_q_entry; /* - * This is the prototype for the mid callback function. 
When creating one, - * take special care to avoid deadlocks. Things to bear in mind: + * This is the prototype for the mid receive function. This function is for + * receiving the rest of the SMB frame, starting with the WordCount (which is + * just after the MID in struct smb_hdr). Note: + * + * - This will be called by cifsd, with no locks held. + * - The mid will still be on the pending_mid_q. + * - mid->resp_buf will point to the current buffer. + * + * Returns zero on a successful receive, or an error. The receive state in + * the TCP_Server_Info will also be updated. + */ +typedef int (mid_receive_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + +/* + * This is the prototype for the mid callback function. This is called once the + * mid has been received off of the socket. When creating one, take special + * care to avoid deadlocks. Things to bear in mind: * * - it will be called by cifsd, with no locks held * - the mid will be removed from any lists @@ -675,9 +691,10 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif + mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ void *callback_data; /* general purpose pointer for callback */ - struct smb_hdr *resp_buf; /* response buffer */ + struct smb_hdr *resp_buf; /* pointer to received SMB header */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ bool largeBuf:1; /* if valid response, is pointer to large buf */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a1fa9cec05d6..8a7adb31ffed 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, - void *cbdata, bool ignore_pend); + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, + bool ignore_pend); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c824c106b2b7..0613df4d8e74 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -737,7 +737,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov.iov_base = smb; iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, + server, true); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -1834,7 +1835,7 @@ cifs_async_writev(struct cifs_writedata *wdata) kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, - cifs_writev_callback, wdata, false); + NULL, cifs_writev_callback, wdata, false); if (rc == 0) cifs_stats_inc(&tcon->num_writes); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f05dedda37c6..eeee2f5d13ce 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -824,7 +824,11 @@ cifs_demultiplex_thread(void *p) mid_entry = find_mid(server, smb_buffer); - length = standard_receive3(server, mid_entry); + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = 
mid_entry->receive(server, mid_entry); + if (length < 0) continue; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 33a3fbf3a3a5..e7398d0cd054 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -339,8 +339,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) */ int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, void *cbdata, - bool ignore_pend) + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, bool ignore_pend) { int rc; struct mid_q_entry *mid; @@ -374,6 +374,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, goto out_err; } + mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; mid->midState = MID_REQUEST_SUBMITTED; -- cgit v1.2.3-58-ga151 From 2ab2593f4b8953ff951f5531e695e487dfe0b51f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:29:59 -0400 Subject: cifs: fix protocol definition for READ_RSP There is no pad, and it simplifies the code to remove the "Data" field. None of the existing code relies on these fields, or on the READ_RSP being a particular length. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifspdu.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de3aa285de03..3c6ef34fe2bc 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp { __le16 DataLengthHigh; __u64 Reserved2; __u16 ByteCount; - __u8 Pad; /* BB check for whether padded to DWORD - boundary and optimum performance here */ - char Data[1]; + /* read response data immediately follows */ } __attribute__((packed)) READ_RSP; typedef struct locking_andx_range { -- cgit v1.2.3-58-ga151 From e28bc5b1fdbd6e850488234d6072e6b66fc46146 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:07 -0400 Subject: cifs: add cifs_async_readv ...which will allow cifs to do an asynchronous read call to the server. The caller will allocate and set up cifs_readdata for each READ_AND_X call that should be issued on the wire. The pages passed in are added to the pagecache, but not placed on the LRU list yet (as we need the page->lru to keep the pages on the list in the readdata). When cifsd identifies the mid, it will see that there is a special receive handler for the call, and use that to receive the rest of the frame. cifs_readv_receive will then marshal up a kvec array with kmapped pages from the pagecache, which eliminates one copy of the data. Once the data is received, the pages are added to the LRU list, set uptodate, and unlocked. 
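As a rough caller-side sketch of the new interface (illustration only, not part of the patch: the identifiers come from the diffs in this series, the pages are assumed to be locked and already added to the page cache, and the locking and -EAGAIN retry done by the real caller are elided):

static int issue_async_read(struct cifsFileInfo *open_file,
			    struct address_space *mapping,
			    struct list_head *locked_pages,
			    unsigned int nr_pages, loff_t offset, pid_t pid)
{
	struct cifs_readdata *rdata;
	int rc;

	/* allocates the readdata plus one kvec per page */
	rdata = cifs_readdata_alloc(nr_pages);
	if (rdata == NULL)
		return -ENOMEM;

	cifsFileInfo_get(open_file);	/* put again by cifs_readdata_free */
	rdata->cfile = open_file;
	rdata->mapping = mapping;
	rdata->offset = offset;		/* file offset of the first page */
	rdata->bytes = nr_pages * PAGE_CACHE_SIZE;
	rdata->pid = pid;
	list_splice_init(locked_pages, &rdata->pages);

	/* cifsd calls cifs_readv_receive() for this mid; the completion
	 * work then unlocks the pages and releases rdata */
	rc = cifs_async_readv(rdata);
	if (rc != 0)
		cifs_readdata_free(rdata);	/* caller also cleans up pages */
	return rc;
}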
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/cifsproto.h | 24 ++++ fs/cifs/cifssmb.c | 359 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/connect.c | 26 ++-- 3 files changed, 396 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8a7adb31ffed..c25d0636cc4f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -154,6 +154,12 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); +extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); +extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read); +extern int cifs_readv_from_socket(struct TCP_Server_Info *server, + struct kvec *iov_orig, unsigned int nr_segs, + unsigned int to_read); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -443,6 +449,24 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); +/* asynchronous read support */ +struct cifs_readdata { + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + pid_t pid; + int result; + struct list_head pages; + struct work_struct work; + unsigned int nr_iov; + struct kvec iov[1]; +}; + +struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages); +void cifs_readdata_free(struct cifs_readdata *rdata); +int cifs_async_readv(struct cifs_readdata *rdata); + /* asynchronous write support */ struct cifs_writedata { struct kref refcount; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0613df4d8e74..aaad4ce6e6c5 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -40,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "fscache.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -83,6 +86,9 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ +/* Forward declarations */ +static void cifs_readv_complete(struct work_struct *work); + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ static void mark_open_files_invalid(struct cifs_tcon *pTcon) @@ -1375,6 +1381,359 @@ openRetry: return rc; } +struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages) +{ + struct cifs_readdata *rdata; + + /* readdata + 1 kvec for each page */ + rdata = kzalloc(sizeof(*rdata) + + sizeof(struct kvec) * nr_pages, GFP_KERNEL); + if (rdata != NULL) { + INIT_WORK(&rdata->work, cifs_readv_complete); + INIT_LIST_HEAD(&rdata->pages); + } + return rdata; +} + +void +cifs_readdata_free(struct cifs_readdata *rdata) +{ + cifsFileInfo_put(rdata->cfile); + kfree(rdata); +} + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. 
+ */ +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); + int remaining = rfclen + 4 - server->total_read; + struct cifs_readdata *rdata = mid->callback_data; + + while (remaining > 0) { + int length; + + length = cifs_read_from_socket(server, server->bigbuf, + min_t(unsigned int, remaining, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + dequeue_mid(mid, rdata->result); + return 0; +} + +static int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, remaining, data_len; + struct cifs_readdata *rdata = mid->callback_data; + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; + u64 eof; + pgoff_t eof_index; + struct page *page, *tpage; + + cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, + mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, rfclen, sizeof(*rsp)) - + sizeof(struct smb_hdr) + 1; + + rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; + rdata->iov[0].iov_len = len; + + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + + /* Was the SMB read successful? */ + rdata->result = map_smb_to_linux_error(&rsp->hdr, false); + if (rdata->result != 0) { + cFYI(1, "%s: server returned error %d", __func__, + rdata->result); + return cifs_readv_discard(server, mid); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < sizeof(READ_RSP)) { + cFYI(1, "%s: server returned short header. got=%u expected=%zu", + __func__, server->total_read, sizeof(READ_RSP)); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = le16_to_cpu(rsp->DataOffset) + 4; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cFYI(1, "%s: data offset (%u) inside read response header", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cFYI(1, "%s: data offset (%u) beyond end of smallbuf", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cFYI(1, "%s: total_read=%u data_offset=%u", __func__, + server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + rdata->iov[0].iov_base = server->smallbuf + server->total_read; + rdata->iov[0].iov_len = len; + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* set up first iov for signature check */ + rdata->iov[0].iov_base = server->smallbuf; + rdata->iov[0].iov_len = server->total_read; + cFYI(1, "0: iov_base=%p iov_len=%zu", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + + /* how much data is in the response? 
*/ + data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; + data_len += le16_to_cpu(rsp->DataLength); + if (data_offset + data_len > rfclen) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + /* marshal up the page array */ + len = 0; + remaining = data_len; + rdata->nr_iov = 1; + + /* determine the eof that the server (probably) has */ + eof = CIFS_I(rdata->mapping->host)->server_eof; + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + if (remaining >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + ++rdata->nr_iov; + len += PAGE_CACHE_SIZE; + remaining -= PAGE_CACHE_SIZE; + } else if (remaining > 0) { + /* enough for partial page, fill and zero the rest */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = remaining; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + memset(rdata->iov[rdata->nr_iov].iov_base + remaining, + '\0', PAGE_CACHE_SIZE - remaining); + ++rdata->nr_iov; + len += remaining; + remaining = 0; + } else if (page->index > eof_index) { + /* + * The VFS will not try to do readahead past the + * i_size, but it's possible that we have outstanding + * writes with gaps in the middle and the i_size hasn't + * caught up yet. Populate those with zeroed out pages + * to prevent the VFS from repeatedly attempting to + * fill them until the writes are flushed. 
+ */ + zero_user(page, 0, PAGE_CACHE_SIZE); + list_del(&page->lru); + lru_cache_add_file(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } else { + /* no need to hold page hostage */ + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + } + + /* issue the read if we have any iovecs left to fill */ + if (rdata->nr_iov > 1) { + length = cifs_readv_from_socket(server, &rdata->iov[1], + rdata->nr_iov - 1, len); + if (length < 0) + return length; + server->total_read += length; + } else { + length = 0; + } + + rdata->bytes = length; + + cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, + rfclen, remaining); + + /* discard anything left over */ + if (server->total_read < rfclen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + return length; +} + +static void +cifs_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct page *page, *tpage; + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + kunmap(page); + + if (rdata->result == 0) { + flush_dcache_page(page); + SetPageUptodate(page); + } + + unlock_page(page); + + if (rdata->result == 0) + cifs_readpage_to_fscache(rdata->mapping->host, page); + + page_cache_release(page); + } + cifs_readdata_free(rdata); +} + +static void +cifs_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, + mid->mid, mid->midState, rdata->result, rdata->bytes); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + /* result already set, check signature */ + if (server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (cifs_verify_signature(rdata->iov, rdata->nr_iov, + server, mid->sequence_number + 1)) + cERROR(1, "Unexpected SMB signature"); + } + /* FIXME: should this be counted toward the initiating task? 
*/ + task_io_account_read(rdata->bytes); + cifs_stats_bytes_read(tcon, rdata->bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + break; + default: + rdata->result = -EIO; + } + + queue_work(system_nrt_wq, &rdata->work); + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +/* cifs_async_readv - send an async read, and set up mid to handle result */ +int +cifs_async_readv(struct cifs_readdata *rdata) +{ + int rc; + READ_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + + cFYI(1, "%s: offset=%llu bytes=%u", __func__, + rdata->offset, rdata->bytes); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((rdata->offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); + if (rc) + return rc; + + smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = rdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); + if (wct == 12) + smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); + smb->Remaining = 0; + smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); + smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); + if (wct == 12) + smb->ByteCount = 0; + else { + /* old style read */ + struct smb_com_readx_req *smbr = + (struct smb_com_readx_req *)smb; + smbr->ByteCount = 0; + } + + /* 4 for RFC1001 length + 1 for BCC */ + rdata->iov[0].iov_base = smb; + rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, + cifs_readv_receive, cifs_readv_callback, + rdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_reads); + + cifs_small_buf_release(smb); + return rc; +} + int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *pbuf_type) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eeee2f5d13ce..3d518b9e8c18 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -422,9 +422,9 @@ get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) return new_iov; } -static int -readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, - unsigned int nr_segs, unsigned int to_read) +int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) { int length = 0; int total_read; @@ -479,16 +479,16 @@ readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, return total_read; } -static int -read_from_socket(struct TCP_Server_Info *server, char *buf, - unsigned int to_read) +int +cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) { struct kvec iov; iov.iov_base = buf; iov.iov_len = to_read; - return readv_from_socket(server, &iov, 1, to_read); + return cifs_readv_from_socket(server, &iov, 1, to_read); } static bool @@ -553,8 +553,8 @@ find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) return NULL; } -static void -dequeue_mid(struct mid_q_entry *mid, int malformed) +void +dequeue_mid(struct mid_q_entry *mid, bool malformed) { #ifdef CONFIG_CIFS_STATS2 mid->when_received = jiffies; @@ -730,7 +730,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* now read the rest */ - length =
read_from_socket(server, + length = cifs_read_from_socket(server, buf + sizeof(struct smb_hdr) - 1, pdu_length - sizeof(struct smb_hdr) + 1 + 4); if (length < 0) @@ -791,7 +791,7 @@ cifs_demultiplex_thread(void *p) buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ - length = read_from_socket(server, buf, pdu_length); + length = cifs_read_from_socket(server, buf, pdu_length); if (length < 0) continue; server->total_read = length; @@ -816,8 +816,8 @@ cifs_demultiplex_thread(void *p) } /* read down to the MID */ - length = read_from_socket(server, buf + 4, - sizeof(struct smb_hdr) - 1 - 4); + length = cifs_read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); if (length < 0) continue; server->total_read += length; -- cgit v1.2.3-58-ga151 From 690c5e3163502f229e5b5d455e5212e28c20cd6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:16 -0400 Subject: cifs: convert cifs_readpages to use async reads Now that we have code in place to do asynchronous reads, convert cifs_readpages to use it. The new cifs_readpages walks the page_list that gets passed in, locks and adds the pages to the pagecache and sets up cifs_readdata to handle the reads. The rest is handled by the cifs_async_readv infrastructure. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/file.c | 281 +++++++++++++++++++++++---------------------------------- 1 file changed, 113 insertions(+), 168 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 852d1f39adae..8f6917816fec 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "cifsfs.h" #include "cifspdu.h" @@ -2000,82 +2001,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } - -static void cifs_copy_cache_pages(struct address_space *mapping, - struct list_head *pages, int bytes_read, char *data) -{ - struct page *page; - char *target; - - while (bytes_read > 0) { - if (list_empty(pages)) - break; - - page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); - - if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL)) { - page_cache_release(page); - cFYI(1, "Add page cache failed"); - data += PAGE_CACHE_SIZE; - bytes_read -= PAGE_CACHE_SIZE; - continue; - } - page_cache_release(page); - - target = kmap_atomic(page, KM_USER0); - - if (PAGE_CACHE_SIZE > bytes_read) { - memcpy(target, data, bytes_read); - /* zero the tail end of this partial page */ - memset(target + bytes_read, 0, - PAGE_CACHE_SIZE - bytes_read); - bytes_read = 0; - } else { - memcpy(target, data, PAGE_CACHE_SIZE); - bytes_read -= PAGE_CACHE_SIZE; - } - kunmap_atomic(target, KM_USER0); - - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - data += PAGE_CACHE_SIZE; - - /* add page to FS-Cache */ - cifs_readpage_to_fscache(mapping->host, page); - } - return; -} - static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { - int rc = -EACCES; - int xid; - loff_t offset; - struct page *page; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *pTcon; - unsigned int bytes_read = 0; - unsigned int read_size, i; - char *smb_read_data = NULL; - struct smb_com_read_rsp *pSMBr; - struct cifsFileInfo *open_file; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - __u32 pid; + int rc; + struct list_head tmplist; + struct cifsFileInfo *open_file = file->private_data; + struct 
cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + unsigned int rsize = cifs_sb->rsize; + pid_t pid; - xid = GetXid(); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } - open_file = file->private_data; - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = tlink_tcon(open_file->tlink); + /* + * Give up immediately if rsize is too small to read an entire page. + * The VFS will fall back to readpage. We should never reach this + * point however since we set ra_pages to 0 when the rsize is smaller + * than a cache page. + */ + if (unlikely(rsize < PAGE_CACHE_SIZE)) + return 0; /* * Reads as many pages as possible from fscache. Returns -ENOBUFS @@ -2084,125 +2027,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); if (rc == 0) - goto read_complete; + return rc; - cFYI(DBG2, "rpages: num pages %d", num_pages); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - for (i = 0; i < num_pages; ) { - unsigned contig_pages; - struct page *tmp_page; - unsigned long expected_index; + rc = 0; + INIT_LIST_HEAD(&tmplist); - if (list_empty(page_list)) - break; + cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, + mapping, num_pages); + + /* + * Start with the page at end of list and move it to private + * list. Do the same with any following pages until we hit + * the rsize limit, hit an index discontinuity, or run out of + * pages. Issue the async read and then start the loop again + * until the list is empty. + * + * Note that list order is important. The page_list is in + * the order of declining indexes. When we put the pages in + * the rdata->pages, then we want them in increasing order. + */ + while (!list_empty(page_list)) { + unsigned int bytes = PAGE_CACHE_SIZE; + unsigned int expected_index; + unsigned int nr_pages = 1; + loff_t offset; + struct page *page, *tpage; + struct cifs_readdata *rdata; page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + break; + } + + /* move first page to the tmplist */ offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + list_move_tail(&page->lru, &tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? 
*/ + if (bytes + PAGE_CACHE_SIZE > rsize) + break; - /* count adjacent pages that we will read into */ - contig_pages = 0; - expected_index = - list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(tmp_page, page_list, lru) { - if (tmp_page->index == expected_index) { - contig_pages++; - expected_index++; - } else + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL)) { + __clear_page_locked(page); break; + } + list_move_tail(&page->lru, &tmplist); + bytes += PAGE_CACHE_SIZE; + expected_index++; + nr_pages++; } - if (contig_pages + i > num_pages) - contig_pages = num_pages - i; - - /* for reads over a certain size could initiate async - read ahead */ - - read_size = contig_pages * PAGE_CACHE_SIZE; - /* Read size needs to be in multiples of one page */ - read_size = min_t(const unsigned int, read_size, - cifs_sb->rsize & PAGE_CACHE_MASK); - cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", - read_size, contig_pages); - rc = -EAGAIN; - while (rc == -EAGAIN) { + + rdata = cifs_readdata_alloc(nr_pages); + if (!rdata) { + /* best to give up if we're out of mem */ + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + rc = -ENOMEM; + break; + } + + spin_lock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + rdata->cfile = open_file; + rdata->mapping = mapping; + rdata->offset = offset; + rdata->bytes = bytes; + rdata->pid = pid; + list_splice_init(&tmplist, &rdata->pages); + + do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) - break; + continue; } - io_parms.netfid = open_file->netfid; - io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = offset; - io_parms.length = read_size; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, - &smb_read_data, &buf_type); - /* BB more RC checks ? */ - if (rc == -EAGAIN) { - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - } - } - if ((rc < 0) || (smb_read_data == NULL)) { - cFYI(1, "Read error in readpages: %d", rc); - break; - } else if (bytes_read > 0) { - task_io_account_read(bytes_read); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - cifs_copy_cache_pages(mapping, page_list, bytes_read, - smb_read_data + 4 /* RFC1001 hdr */ + - le16_to_cpu(pSMBr->DataOffset)); - - i += bytes_read >> PAGE_CACHE_SHIFT; - cifs_stats_bytes_read(pTcon, bytes_read); - if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { - i++; /* account for partial page */ - - /* server copy of file can have smaller size - than client */ - /* BB do we need to verify this common case ? - this case is ok - if we are at server EOF - we will hit it on next read */ + rc = cifs_async_readv(rdata); + } while (rc == -EAGAIN); - /* break; */ + if (rc != 0) { + list_for_each_entry_safe(page, tpage, &rdata->pages, + lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); } - } else { - cFYI(1, "No bytes read (%d) at offset %lld . " - "Cleaning remaining pages from readahead list", - bytes_read, offset); - /* BB turn off caching and do new lookup on - file size at server? 
*/ + cifs_readdata_free(rdata); break; } - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - bytes_read = 0; - } - -/* need to free smb_read_data buf before exit */ - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; } -read_complete: - FreeXid(xid); return rc; } -- cgit v1.2.3-58-ga151 From 5eba8ab3606621f7e175ae9f521d71f3ac534f82 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:26 -0400 Subject: cifs: allow for larger rsize= options and change defaults Currently we cap the rsize at a value that fits in CIFSMaxBufSize. That's not needed any longer for readpages. Allow the use of larger values for readpages. cifs_iovec_read and cifs_read however are still limited to the CIFSMaxBufSize. Make sure they don't exceed that. The patch also changes the rsize defaults. The default when unix extensions are enabled is set to 1M for parity with the wsize, and there is a hard cap of ~16M. When unix extensions are not enabled, the default is set to 60k. According to MS-CIFS, Windows servers can only send a max of 60k at a time, so this is more efficient than requesting a larger size. If the user wishes however, the max can be extended up to 128k - the length of the READ_RSP header. Really old servers however require a special hack to ensure that we don't request too large a read. Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 120 +++++++++++++++++++++++++++++++++--------------------- fs/cifs/file.c | 14 +++++-- 2 files changed, 84 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3d518b9e8c18..06dfaacfea5d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2310,16 +2310,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) return 0; - if (old->rsize != new->rsize) - return 0; - /* - * We want to share sb only if we don't specify wsize or specified wsize - * is greater or equal than existing one. + * We want to share sb only if we don't specify an r/wsize or + * specified r/wsize is greater than or equal to existing one. 
*/ if (new->wsize && new->wsize < old->wsize) return 0; + if (new->rsize && new->rsize < old->rsize) + return 0; + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) return 0; @@ -2757,14 +2757,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, CIFS_MOUNT_POSIX_PATHS; } - if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { - if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - cifs_sb->rsize = 127 * 1024; - cFYI(DBG2, "larger reads not supported by srv"); - } - } - - cFYI(1, "Negotiate caps 0x%x", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) @@ -2809,27 +2801,11 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - if (pvolume_info->rsize > CIFSMaxBufSize) { - cERROR(1, "rsize %d too large, using MaxBufSize", - pvolume_info->rsize); - cifs_sb->rsize = CIFSMaxBufSize; - } else if ((pvolume_info->rsize) && - (pvolume_info->rsize <= CIFSMaxBufSize)) - cifs_sb->rsize = pvolume_info->rsize; - else /* default */ - cifs_sb->rsize = CIFSMaxBufSize; - - if (cifs_sb->rsize < 2048) { - cifs_sb->rsize = 2048; - /* Windows ME may prefer this */ - cFYI(1, "readsize set to minimum: 2048"); - } - /* - * Temporarily set wsize for matching superblock. If we end up using - * new sb then cifs_negotiate_wsize will later negotiate it downward - * if needed. + * Temporarily set r/wsize for matching superblock. If we end up using + * new sb then client will later negotiate it downward if needed. */ + cifs_sb->rsize = pvolume_info->rsize; cifs_sb->wsize = pvolume_info->wsize; cifs_sb->mnt_uid = pvolume_info->linux_uid; @@ -2904,29 +2880,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, } /* - * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including - * the RFC1001 length. + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. * * Note that this might make for "interesting" allocation problems during * writeback however as we have to allocate an array of pointers for the * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. */ #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) /* - * When the server doesn't allow large posix writes, only allow a wsize of - * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up - * to the maximum size described by RFC1002. + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. */ #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill * a single wsize request with a single call. */ -#define CIFS_DEFAULT_WSIZE (1024 * 1024) +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60k reads. Default to that when posix + * extensions aren't in force. 
+ */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) @@ -2934,7 +2922,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : - CIFS_DEFAULT_WSIZE; + CIFS_DEFAULT_IOSIZE; /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -2957,6 +2945,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) return wsize; } +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * + * If the server advertises a MaxBufferSize of less than one page, + * assume that it also can't satisfy reads larger than that either. + * + * FIXME: Is there a better heuristic for this? + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else if (server->maxBuf >= PAGE_CACHE_SIZE) + defsize = CIFSMaxBufSize; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. 
+ */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + static int is_path_accessible(int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -3234,14 +3266,8 @@ try_mount_again: CIFSSMBQFSAttributeInfo(xid, tcon); } - if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { - cifs_sb->rsize = 1024 * 127; - cFYI(DBG2, "no very large read support, rsize now 127K"); - } - if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, CIFSMaxBufSize); - cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8f6917816fec..a3b545ff5250 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1758,6 +1758,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct smb_com_read_rsp *pSMBr; struct cifs_io_parms io_parms; char *read_data; + unsigned int rsize; __u32 pid; if (!nr_segs) @@ -1770,6 +1771,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); @@ -1782,7 +1786,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cFYI(1, "attempting read on write only file instance"); for (total_read = 0; total_read < len; total_read += bytes_read) { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + cur_len = min_t(const size_t, len - total_read, rsize); rc = -EAGAIN; read_data = NULL; @@ -1874,6 +1878,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, unsigned int bytes_read = 0; unsigned int total_read; unsigned int current_read_size; + unsigned int rsize; struct cifs_sb_info *cifs_sb; struct cifs_tcon *pTcon; int xid; @@ -1886,6 +1891,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + if (file->private_data == NULL) { rc = -EBADF; FreeXid(xid); @@ -1905,8 +1913,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, - cifs_sb->rsize); + current_read_size = min_t(uint, read_size - total_read, rsize); + /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && -- cgit v1.2.3-58-ga151 From 66bfaadc3da74fecb4ba8b03f6b81d5f58b031fa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:35 -0400 Subject: cifs: tune bdi.ra_pages in accordance with the rsize Tune bdi.ra_pages to be a multiple of the rsize. This prevents the VFS from asking for pages that require small reads to satisfy. 
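As a worked example (assuming 4k pages and the kernel's default readahead window of 32 pages): with the 60k non-POSIX default rsize, rsize_pages = 60k / 4k = 15, so the cifs_ra_pages() helper below rounds the window down to (32 / 15) * 15 = 30 pages, and each readahead batch then divides into whole rsize-sized reads with no short trailing request.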
Reviewed-and-Tested-by: Pavel Shilovsky Signed-off-by: Jeff Layton --- fs/cifs/connect.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 06dfaacfea5d..f70d87d6ba61 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3182,6 +3182,22 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +/* make sure ra_pages is a multiple of rsize */ +static inline unsigned int +cifs_ra_pages(struct cifs_sb_info *cifs_sb) +{ + unsigned int reads; + unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + + if (rsize_pages >= default_backing_dev_info.ra_pages) + return default_backing_dev_info.ra_pages; + else if (rsize_pages == 0) + return rsize_pages; + + reads = default_backing_dev_info.ra_pages / rsize_pages; + return reads * rsize_pages; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3200,8 +3216,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) if (rc) return rc; - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3269,6 +3283,9 @@ try_mount_again: cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); + /* tune readahead according to rsize */ + cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb); + remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL /* -- cgit v1.2.3-58-ga151 From fef33df88bef6810cc3c4e6edf55c741a8fd68e3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:37 -0400 Subject: cifs: allow cifs_max_pending to be readable under /sys/module/cifs/parameters Signed-off-by: Jeff Layton --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b0a2e1647390..fa6724562a41 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0); +module_param(cifs_max_pending, int, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); unsigned short echo_retries = 5; -- cgit v1.2.3-58-ga151 From f06ac72e929115f2772c29727152ba0832d641e4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Oct 2011 15:30:40 -0400 Subject: cifs, freezer: add wait_event_freezekillable and have cifs use it CIFS currently uses wait_event_killable to put tasks to sleep while they await replies from the server. That function though does not allow the freezer to run. In many cases, the network interface may be going down anyway, in which case the reply will never come. The client then ends up blocking the computer from suspending. Fix this by adding a new wait_event_freezable variant -- wait_event_freezekillable. The idea is to combine the behavior of wait_event_killable and wait_event_freezable -- put the task to sleep and only allow it to be awoken by fatal signals, but also allow the freezer to do its job. 
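As a usage sketch (hypothetical waiter shown; the actual cifs conversion is in the transport.c hunk below), adopting the new macro is a one-line change for any sleeper with the same needs:

-	error = wait_event_killable(waitq, done_flag);
+	error = wait_event_freezekillable(waitq, done_flag);
 	if (error < 0)
 		return -ERESTARTSYS;	/* fatal signal, not the freezer */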
Signed-off-by: Jeff Layton --- fs/cifs/transport.c | 3 ++- include/linux/freezer.h | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e7398d0cd054..0cc9584f5889 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_killable(server->response_q, + error = wait_event_freezekillable(server->response_q, midQ->midState != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 1effc8b56b4e..3672f731f03a 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -134,10 +134,25 @@ static inline void set_freezable_with_signal(void) } /* - * Freezer-friendly wrappers around wait_event_interruptible() and - * wait_event_interruptible_timeout(), originally defined in + * Freezer-friendly wrappers around wait_event_interruptible(), + * wait_event_killable() and wait_event_interruptible_timeout(), originally + * defined in */ +#define wait_event_freezekillable(wq, condition) \ +({ \ + int __retval; \ + do { \ + __retval = wait_event_killable(wq, \ + (condition) || freezing(current)); \ + if (__retval && !freezing(current)) \ + break; \ + else if (!(condition)) \ + __retval = -ERESTARTSYS; \ + } while (try_to_freeze()); \ + __retval; \ +}) + #define wait_event_freezable(wq, condition) \ ({ \ int __retval; \ -- cgit v1.2.3-58-ga151 From d00c5d43866720963a265fa3129f3203cac35b8e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Get rid of nfs_restart_rpc() It can trivially be replaced with rpc_restart_call_prepare. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 10 ---------- fs/nfs/nfs4filelayout.c | 14 ++++---------- fs/nfs/nfs4proc.c | 17 ++++++++--------- fs/nfs/read.c | 2 +- fs/nfs/unlink.c | 4 ++-- fs/nfs/write.c | 2 +- net/sunrpc/clnt.c | 4 +++- 7 files changed, 19 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ab12913dd473..c1a1bd8ddf1c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } -/* - * Helper for restarting RPC calls in the possible presence of NFSv4.1 - * sessions. 
- */ -static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) -{ - if (nfs4_has_session(clp)) - return rpc_restart_call_prepare(task); - return rpc_restart_call(task); -} diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 4c78c62639e6..09119418402f 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -122,7 +122,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_client *clp = data->ds_clp; int reset = 0; dprintk("%s DS read\n", __func__); @@ -134,9 +133,8 @@ static int filelayout_read_done_cb(struct rpc_task *task, if (reset) { pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; } - nfs_restart_rpc(task, clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -203,17 +201,13 @@ static int filelayout_write_done_cb(struct rpc_task *task, if (filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, &reset) == -EAGAIN) { - struct nfs_client *clp; - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; - } else - clp = data->ds_clp; - nfs_restart_rpc(task, clp); + } + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -245,7 +239,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, prepare_to_resend_writes(data); pnfs_set_lo_fail(data->lseg); } else - nfs_restart_rpc(task, data->ds_clp); + rpc_restart_call_prepare(task); return -EAGAIN; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ba0da50865fe..d2ae413c986a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3191,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) struct nfs_server *server = NFS_SERVER(data->inode); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -3241,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3298,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } nfs_refresh_inode(inode, data->res.fattr); @@ -3838,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, data->res.server->nfs_client); + rpc_restart_call_prepare(task); return; } } @@ -4092,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs_restart_rpc(task, - calldata->server->nfs_client); + rpc_restart_call_prepare(task); } } @@ -4926,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) 
task->tk_status = 0; /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: - nfs_restart_rpc(task, data->clp); + rpc_restart_call_prepare(task); return; } dprintk("<-- %s\n", __func__); @@ -5767,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, lrp->clp); + rpc_restart_call_prepare(task); return; } spin_lock(&lo->plh_inode->i_lock); @@ -5938,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) } if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index bfc20b160243..e866a7e6e2d5 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -435,7 +435,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); + rpc_restart_call_prepare(task); } /* diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b2fbbde58e44..4f9319a2e567 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + rpc_restart_call_prepare(task); } /** @@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { - nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + rpc_restart_call_prepare(task); return; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2084a6494218..ad90b0c998cb 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1305,7 +1305,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } if (time_before(complain, jiffies)) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c5347d29cfb7..f0268ea7e711 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -850,7 +850,9 @@ rpc_restart_call_prepare(struct rpc_task *task) { if (RPC_ASSASSINATED(task)) return 0; - task->tk_action = rpc_prepare_task; + task->tk_action = call_start; + if (task->tk_ops->rpc_call_prepare != NULL) + task->tk_action = rpc_prepare_task; return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); -- cgit v1.2.3-58-ga151 From fbb5a9abf0d589e9471dc93b18025b7b921d22c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Get rid of unnecessary calls to ClearPageError() in read code The generic file read code does that for us anyway. 
Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 2 -- fs/nfs/write.c | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e866a7e6e2d5..09829d96d207 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -322,7 +322,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - ClearPageError(page); desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; out_bad: @@ -357,7 +356,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head * req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ad90b0c998cb..2219c88d96b2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1011,7 +1011,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); -- cgit v1.2.3-58-ga151 From fba730050d1246d0e6ef44e026e0b584732fec2b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Don't rely on PageError in nfs_readpage_release_partial Don't rely on the PageError flag to tell us if one of the partial reads of the page failed. Instead, replace that with a dedicated flag in the struct nfs_page. Then clean out redundant uses of the PageError flag: the VM no longer checks it for reads. Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 7 ++----- include/linux/nfs_page.h | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 09829d96d207..fd58e909842b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -276,7 +276,6 @@ nfs_async_read_error(struct list_head *head) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - SetPageError(req->wb_page); nfs_readpage_release(req); } } @@ -330,7 +329,6 @@ out_bad: list_del(&data->list); nfs_readdata_free(data); } - SetPageError(page); nfs_readpage_release(req); return -ENOMEM; } @@ -460,10 +458,10 @@ static void nfs_readpage_release_partial(void *calldata) int status = data->task.tk_status; if (status < 0) - SetPageError(page); + set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); if (atomic_dec_and_test(&req->wb_complete)) { - if (!PageError(page)) + if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) SetPageUptodate(page); nfs_readpage_release(req); } @@ -656,7 +654,6 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); - SetPageError(page); out_unlock: unlock_page(page); return error; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index e2791a27a901..ab465fe8c3d6 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -34,6 +34,7 @@ enum { PG_NEED_COMMIT, PG_NEED_RESCHED, PG_PNFS_COMMIT, + PG_PARTIAL_READ_FAILED, }; struct nfs_inode; -- cgit v1.2.3-58-ga151 From b6ee8cd2642f6d822dd1a4ba62298b65ff99b72e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2011 12:17:29 -0700 Subject: NFS: Get rid of the nfs_rdata_mempool We don't need a mempool in order to guarantee reliable NFS read performance. 
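The reasoning can be sketched like this (illustrative fragment, not part of the patch; MIN_RESERVED and the variable names are placeholders): a mempool's reserved objects guarantee forward progress where allocation failure is not an option, which is a writeback property rather than a read property.

        /* Writeout side: must not deadlock on allocation, so objects
         * are reserved up front and mempool_alloc() can always succeed. */
        pool = mempool_create_slab_pool(MIN_RESERVED, cachep);
        obj = mempool_alloc(pool, GFP_NOFS);

        /* Read side: failure is harmless, -ENOMEM just bubbles up. */
        p = kmem_cache_zalloc(cachep, GFP_KERNEL);
        if (!p)
                return -ENOMEM;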
Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index fd58e909842b..8b48ec63f722 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + struct nfs_read_data *p; + p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); if (p) { - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) else { p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) { - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); p = NULL; } } @@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); } void nfs_readdata_release(struct nfs_read_data *rdata) @@ -716,16 +713,10 @@ int __init nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, - nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } -- cgit v1.2.3-58-ga151 From 487505c257021fc06a7d05753cf27b011487f1dc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Oct 2011 21:53:38 +0000 Subject: sysfs: Implement support for tagged files in sysfs. Looking up files in sysfs is hard to understand and analyze because we currently allow placing untagged files in tagged directories. In the implementation of that we have two subtly different meanings of NULL. NULL meaning there is no tag on a directory entry and NULL meaning we don't care which namespace the lookup is performed for. These multiple uses of NULL have resulted in subtle bugs (since fixed) in the code. Currently it is only the bonding driver that needs to have an untagged file in a tagged directory. To untangle this mess I am adding support for tagged files to sysfs. Modifying the bonding driver to implement bonding_masters as a tagged file. Registering bonding_masters once for each network namespace. Then I am removing support for untagged entries in tagged sysfs directories. Resulting in code that is much easier to reason about. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: David S.
Miller --- fs/sysfs/file.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/sysfs.h | 1 + 2 files changed, 52 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ad8c93c1b85..07c1b4ec00df 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -488,17 +488,56 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; +int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + const struct sysfs_ops *ops; + const void *ns = NULL; + int err; + + err = 0; + if (!sysfs_ns_type(dir_sd)) + goto out; + + err = -EINVAL; + if (!kobj->ktype) + goto out; + ops = kobj->ktype->sysfs_ops; + if (!ops) + goto out; + if (!ops->namespace) + goto out; + + err = 0; + ns = ops->namespace(kobj, attr); +out: + if (err) { + WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " + "kobject: %s\n", kobject_name(kobj)); + } + *pns = ns; + return err; +} + int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type, mode_t amode) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; + const void *ns; int rc; + rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); + if (rc) + return rc; + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; + + sd->s_ns = ns; sd->s_attr.attr = (void *)attr; sysfs_dirent_init_lockdep(sd); @@ -586,12 +625,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, { struct sysfs_dirent *sd; struct iattr newattrs; + const void *ns; int rc; + rc = sysfs_attr_ns(kobj, attr, &ns); + if (rc) + return rc; + mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + sd = sysfs_find_dirent(kobj->sd, ns, attr->name); if (!sd) goto out; @@ -616,7 +660,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, NULL, attr->name); + const void *ns; + + if (sysfs_attr_ns(kobj, attr, &ns)) + return; + + sysfs_hash_and_remove(kobj->sd, ns, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d7d2f2158142..dac0859e6440 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -112,6 +112,7 @@ struct bin_attribute { struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *,char *); ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t); + const void *(*namespace)(struct kobject *, const struct attribute *); }; struct sysfs_dirent; -- cgit v1.2.3-58-ga151 From 23396180a9770df2c6a694bbb689c12bdf792f94 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Oct 2011 22:01:34 +0000 Subject: sysfs: Remove support for tagged directories with untagged members. Now that /sys/class/net/bonding_masters is implemented as a tagged sysfs file we can remove support for untagged files in tagged directories. This change removes any ambiguity of what a NULL namespace value means. A NULL namespace parameter after this patch means that we are talking about an untagged sysfs dirent. This makes the sysfs code much less prone to mistakes during maintenance. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: David S.
Miller --- fs/sysfs/dir.c | 6 +++--- fs/sysfs/inode.c | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a830d8..352d26d98c0a 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -543,7 +543,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, struct sysfs_dirent *sd; for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) + if (sd->s_ns != ns) continue; if (!strcmp(sd->s_name, name)) return sd; @@ -885,7 +885,7 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, while (pos && (ino > pos->s_ino)) pos = pos->s_sibling; } - while (pos && pos->s_ns && pos->s_ns != ns) + while (pos && pos->s_ns != ns) pos = pos->s_sibling; return pos; } @@ -896,7 +896,7 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, pos = sysfs_dir_pos(ns, parent_sd, ino, pos); if (pos) pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) + while (pos && pos->s_ns != ns) pos = pos->s_sibling; return pos; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a81c72..527f0cca66ee 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -336,8 +336,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha sysfs_addrm_start(&acxt, dir_sd); sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd && (sd->s_ns != ns)) - sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); -- cgit v1.2.3-58-ga151 From 903e21e2eea036f6947f523f732e28b33a63ed0f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Oct 2011 22:02:43 +0000 Subject: sysfs: Reject with a warning invalid uses of tagged directories. sysfs is a core piece of infrastructure that many people use and few people have all of the rules in their head on how to use it correctly. Add warnings for people using tagged directories improperly so that any misuses can be caught and diagnosed quickly. A single inexpensive test in sysfs_find_dirent is almost sufficient to catch all possible misuses. An additional warning is needed in sysfs_add_dirent so that we actually fail when attempting to add an untagged dirent in a tagged directory. Signed-off-by: Eric W. Biederman Acked-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- fs/sysfs/dir.c | 14 ++++++++++++++ fs/sysfs/file.c | 3 --- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 352d26d98c0a..26f370a9b5ce 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -384,6 +384,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; + if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + acxt->parent_sd->s_name, sd->s_name); + return -EINVAL; + } + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; @@ -542,6 +549,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, { struct sysfs_dirent *sd; + if (!!sysfs_ns_type(parent_sd) != !!ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd)?
"required": "invalid", + parent_sd->s_name, name); + return NULL; + } + for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { if (sd->s_ns != ns) continue; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 07c1b4ec00df..d4e6080b4b20 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - /* Only directories are tagged, so no need to pass - * a tag explicitly. - */ sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) sd = sysfs_find_dirent(sd, NULL, attr); -- cgit v1.2.3-58-ga151 From 71c424bac5679200e272357a225639da8bf94068 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Oct 2011 20:44:48 -0500 Subject: [CIFS] Show nostrictsync and noperm mount options in /proc/mounts Add support to print nostrictsync and noperm mount options in /proc/mounts for shares mounted with these options. (cleanup merge conflict in Sachin's original patch) Suggested-by: Sachin Prabhu Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b0a2e1647390..96a48baad8f7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -436,6 +436,10 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + seq_printf(s, ",nostrictsync"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + seq_printf(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) seq_printf(s, ",strictcache"); -- cgit v1.2.3-58-ga151 From 84850e8d8a5ec7b9d3c47d224e9a10c9da52ff1b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 29 Aug 2011 09:25:53 +0800 Subject: btrfs: check file extent backref offset underflow Offset field in data extent backref can underflow if clone range ioctl is used. We can reliably detect the underflow because max file size is limited to 2^63 and max data extent size is limited by block group size. Signed-off-by: Zheng Yan --- fs/btrfs/relocation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 10af6a0e0865..24d654ce7a06 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3322,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; -- cgit v1.2.3-58-ga151 From 60ccf82f5b6e26e10d41783464ca469c070c7d49 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Thu, 1 Sep 2011 16:33:57 +0200 Subject: btrfs: fix memory leak in btrfs_defrag_file kmemleak found this: unreferenced object 0xffff8801b64af968 (size 512): comm "btrfs-cleaner", pid 3317, jiffies 4306810886 (age 903.272s) hex dump (first 32 bytes): 00 82 01 07 00 ea ff ff c0 83 01 07 00 ea ff ff ................ 80 82 01 07 00 ea ff ff c0 87 01 07 00 ea ff ff ................ backtrace: [] kmemleak_alloc+0x5c/0xc0 [] kmem_cache_alloc_trace+0x163/0x240 [] btrfs_defrag_file+0xf0/0xb20 [] btrfs_run_defrag_inodes+0x165/0x210 [] cleaner_kthread+0x177/0x190 [] kthread+0x8d/0xa0 [] kernel_thread_helper+0x4/0x10 [] 0xffffffffffffffff "pages" is not always freed. Fix it removing the unnecesary additional return. 
Signed-off-by: Diego Calleja --- fs/btrfs/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d2b53eb8a8c2..8ccc106f4e18 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1140,9 +1140,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, btrfs_set_super_incompat_flags(disk_super, features); } - if (!file) - kfree(ra); - return defrag_count; + ret = defrag_count; out_ra: if (!file) -- cgit v1.2.3-58-ga151 From cbcc83265d929ac71553c1b5dafdb830171af947 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:25 +0800 Subject: Btrfs: fix defragmentation regression There's an off-by-one bug: # create a file with lots of 4K file extents # btrfs fi defrag /mnt/file # sync # filefrag -v /mnt/file Filesystem type is: 9123683e File size of /mnt/file is 1228800 (300 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3372 64 1 64 3136 3435 1 2 65 3436 3136 64 3 129 3201 3499 1 4 130 3500 3201 64 5 194 3266 3563 1 6 195 3564 3266 64 7 259 3331 3627 1 8 260 3628 3331 40 eof After this patch: ... # filefrag -v /mnt/file Filesystem type is: 9123683e File size of /mnt/file is 1228800 (300 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3372 300 eof /mnt/file: 1 extent found Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8ccc106f4e18..b39f7bf92704 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1087,7 +1087,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - i += ret; if (newer_than) { if (newer_off == (u64)-1) @@ -1107,7 +1106,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; } } else { - i++; + if (ret > 0) + i += ret; + else + i++; } } -- cgit v1.2.3-58-ga151 From 151a31b25e5c941bdd9fdefed650effca223c716 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:39 +0800 Subject: Btrfs: use i_size_read() in btrfs_defrag_file() Don't use inode->i_size directly, since we're not holding i_mutex. This also fixes another bug, that i_size can change after it's checked against 0 and then (i_size - 1) can be negative. 
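The idiom in isolation (fragment for illustration): snapshot the size once and use only the local copy, so the zero test and the last-index computation cannot see two different values if a truncate races in.

        u64 isize = i_size_read(inode); /* consistent snapshot, even on 32-bit */
        pgoff_t last_index;

        if (isize == 0)
                return 0;
        /* isize is local, so (isize - 1) cannot underflow after the check */
        last_index = (isize - 1) >> PAGE_CACHE_SHIFT;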
Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b39f7bf92704..323d77f09258 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -978,6 +978,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; + u64 isize = i_size_read(inode); u64 features; u64 last_len = 0; u64 skip = 0; @@ -1003,7 +1004,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (inode->i_size == 0) + if (isize == 0) return 0; /* @@ -1028,10 +1029,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, + last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; } if (newer_than) { -- cgit v1.2.3-58-ga151 From 5ca496604b5975d371bb669ee6c2394bcbea818f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:55 +0800 Subject: Btrfs: fix wrong max_to_defrag in btrfs_defrag_file() It's off-by-one, and thus we may skip the last page while defragmenting. An example case: # create /mnt/file with 2 4K file extents # btrfs fi defrag /mnt/file # sync # filefrag /mnt/file /mnt/file: 2 extents found So it's not defragmented. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 323d77f09258..f9026413bcf1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1052,7 +1052,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index - 1; + max_to_defrag = last_index; while (i <= last_index && defrag_count < max_to_defrag) { /* -- cgit v1.2.3-58-ga151 From 83c8c9bde0add721f7509aa446455183b040b931 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Wed, 14 Sep 2011 14:11:21 +0800 Subject: btrfs: trivial fix, a potential memory leak in btrfs_parse_early_options() Signed-off-by: Jie Liu --- fs/btrfs/super.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 266d1f35465d..09ce951666ea 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -419,7 +419,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; + char *device_name, *opts, *orig, *p; int error = 0; int intarg; @@ -470,8 +470,14 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) goto out; break; -- cgit v1.2.3-58-ga151 From 008873eafbc77deb1702aedece33756c58486c6a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:57:07 +0800 Subject: Btrfs: honor extent thresh during defragmentation We won't defrag an extent, if it's bigger than the threshold we specified and 
there's no small extent before it, but actually the code doesn't work this way. There are three bugs: - When should_defrag_range() decides we should keep on defragmenting an extent, last_len is not incremented. (old bug) - The length that passes to should_defrag_range() is not the length we're going to defrag. (new bug) - We always defrag 256K bytes data, and a big extent can be part of this range. (new bug) For a file with 4 extents: | 4K | 4K | 256K | 256K | The result of defrag with (the default) 256K extent thresh should be: | 264K | 256K | but with those bugs, we'll get: | 520K | Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f9026413bcf1..d524b6697ad9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,7 +765,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, int ret = 1; /* - * make sure that once we start defragging and extent, we keep on + * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) @@ -810,7 +810,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. */ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -984,13 +983,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; - int newer_left = 0; unsigned long i; + unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -1020,7 +1020,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * newer_cluster, + pages = kmalloc(sizeof(struct page *) * max_cluster, GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1045,7 +1045,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else goto out_ra; } else { @@ -1077,12 +1076,26 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = max(i + 1, next); continue; } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } - ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) goto out_ra; @@ -1102,15 +1115,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else { break; } } else { - if (ret > 0) + if (ret > 0) { i += ret; - else + last_len += ret << PAGE_CACHE_SHIFT; + } else { i++; + 
last_len = 0; + } } } -- cgit v1.2.3-58-ga151 From f4c697e6406da5dd445eda8d923c53e1138793dd Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 5 Sep 2011 16:34:54 +0200 Subject: btrfs: return EINVAL if start > total_bytes in fitrim ioctl We should return EINVAL if the start is beyond the end of the file system in the btrfs_ioctl_fitrim(). Fix that by adding the appropriate check for it. Also in the btrfs_trim_fs() it is possible that len+start might overflow if big values are passed. Fix it by decrementing the len so that start+len is equal to the file system size in the worst case. Signed-off-by: Lukas Czerner --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d524b6697ad9..136a2f980e21 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,6 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -300,12 +301,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) } } rcu_read_unlock(); + if (!num_devices) return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; + if (range.start > total_bytes) + return -EINVAL; + range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(root, &range); if (ret < 0) -- cgit v1.2.3-58-ga151 From a05a9bb18ae0abec0b513b5fde876c47905fa13e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Sep 2011 16:55:34 +0800 Subject: Btrfs: fix array bound checking Otherwise we can exceed the array bound of path->slots[]. Signed-off-by: Li Zefan --- fs/btrfs/ctree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 011cab3aca8d..0fe615e4ea38 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; -- cgit v1.2.3-58-ga151 From 560f7d75457f86a43970aa413e334e394082dce4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Sep 2011 10:22:01 +0800 Subject: Btrfs: remove BUG_ON() in compress_file_range() It's not a big deal if we fail to allocate the array, and instead of panicking we can just give up on compressing.
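The general shape of such fixes (sketch): BUG_ON() is for states that must be impossible, while a failed optional allocation is an expected state with a graceful answer.

        /* Before: a transient -ENOMEM becomes a kernel panic. */
        BUG_ON(!pages);

        /* After: treat the failure as "skip the optimisation". */
        if (!pages)
                goto cont;      /* carry on down the uncompressed path */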
Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f12747c9447b..81d4f68f35c9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,7 +393,10 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - BUG_ON(!pages); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +427,7 @@ again: will_compress = 1; } } +cont: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); -- cgit v1.2.3-58-ga151 From f0dd9592a1aa014b3a01aa2be7e795aae040d65b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Sep 2011 10:26:51 +0800 Subject: Btrfs: fix direct-io vs nodatacow To reproduce the bug: # mount -o nodatacow /dev/sda7 /mnt/ # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB) copied, 0.000136115 s, 30.1 MB/s # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 conv=notrunc oflag=direct dd: writing `/mnt/tmp': Input/output error 1+0 records in 0+0 records out btrfs_ordered_update_i_size() may return 1, but btrfs_endio_direct_write() mistakenly takes it as an error. Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81d4f68f35c9..65474d95f26f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5777,8 +5777,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - ret = btrfs_update_inode(trans, root, inode); - err = ret; + err = btrfs_update_inode(trans, root, inode); goto out; } -- cgit v1.2.3-58-ga151 From fee187d9d9ddc382c81370a9a280391132dea2e1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Sep 2011 15:55:28 +0800 Subject: Btrfs: do not set EXTENT_DIRTY along with EXTENT_DELALLOC Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0ada0b700b44..f284d4e5f447 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1107,7 +1107,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } -- cgit v1.2.3-58-ga151 From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 2 Oct 2011 13:56:53 +0300 Subject: Btrfs: pass the correct root to lookup_free_space_inode() Free space items are located in the tree of tree roots, not in the extent tree. It didn't pop up because lookup_free_space_inode() grabs the inode all the time instead of actually searching the tree.
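Why it matters, sketched (the key layout shown is schematic): a btrfs lookup is addressed entirely by the root handle passed in, so the same key searched under the extent root would land in the wrong tree.

        struct btrfs_key key;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;       /* cache inode item */
        key.type = 0;
        key.offset = block_group->key.objectid;

        /* the tree of tree roots, not the extent root, holds these items */
        ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);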
Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4eb7d2ba38f8..6cfcc9060c83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); -- cgit v1.2.3-58-ga151 From cfbffc39ac89dbd5197cbeec2599a1128eb928f8 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 6 Oct 2011 13:37:08 +0900 Subject: Btrfs: fix return value of btrfs_get_acl() In btrfs_get_acl(), when the second __btrfs_getxattr() call fails, acl is not correctly set. Therefore, a wrong value might return to the caller. Signed-off-by: Tsutomu Itoh --- fs/btrfs/acl.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index eb159aaa5a11..89b156d85d63 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) { - kfree(value); - return acl; - } - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } -- cgit v1.2.3-58-ga151 From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 9 Sep 2011 17:34:35 +0800 Subject: Btrfs: fix race between multi-task space allocation and caching space The task may fail to get free space though it is enough when multi-task space allocation and caching space happen at the same time. Task1 Caching Thread Task2 ------------------------------------------------------------------------ find_free_extent The space has not be cached, and start caching thread. And wait for it. cache space, if the space is > 2MB wake up Task1 find_free_extent get all the space that is cached. try to allocate space, but there is no space now. trigger BUG_ON() The message is following: btrfs allocation failed flags 1, wanted 4096 space_info has 1040187392 free, is not full space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0 block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:835! [] __extent_writepage+0x1bf/0x5ce [btrfs] [] ? __set_page_dirty_nobuffers+0xfe/0x108 [] ? wait_current_trans+0x23/0xec [btrfs] [] ? find_get_pages_tag+0x73/0xe2 [] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs] [] extent_writepages+0x3e/0x53 [btrfs] [] ? do_sync_write+0xc6/0x103 [] ? btrfs_submit_direct+0x414/0x414 [btrfs] [] ? 
fsnotify+0x236/0x266 [] btrfs_writepages+0x22/0x24 [btrfs] [] do_writepages+0x1c/0x25 [] __filemap_fdatawrite_range+0x4e/0x50 [] filemap_write_and_wait_range+0x28/0x51 [] btrfs_sync_file+0x7d/0x198 [btrfs] [] ? fsnotify_modify+0x5d/0x65 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b [SNIP] RIP [] cow_file_range+0x1c4/0x32b [btrfs] We fix this bug by trying to allocate the space again if there are block groups in caching. Signed-off-by: Miao Xie --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6cfcc9060c83..cef355f1328a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -5036,6 +5037,7 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -5244,6 +5246,8 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: @@ -5294,6 +5298,9 @@ loop: } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; -- cgit v1.2.3-58-ga151 From 068132bad1de70f85f5f6d12c36d64f8f7848d92 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Thu, 23 Jun 2011 23:01:01 +0800 Subject: btrfs: fix oops on failure path If lookup_extent_backref fails, path->nodes[0] reasonably could be null along with other callers of btrfs_print_leaf, so ensure we have a valid extent buffer before dereferencing. Signed-off-by: Daniel J Blueman --- fs/btrfs/print-tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e9..f38e452486b8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; + if (!l) + return; + + nr = btrfs_header_nritems(l); + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); -- cgit v1.2.3-58-ga151 From 5f524444c351e145a5f7e28253594688a421bfe8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Oct 2011 00:20:43 +0300 Subject: Btrfs: fix a bug when opening seed devices Initialize fs_info->bdev_holder a bit earlier to be able to pass a correct holder id to blkdev_get() when opening seed devices with O_EXCL. 
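For context, the exclusive-open contract involved looks roughly like this (2011-era block layer API; path and holder are illustrative): every FMODE_EXCL open belonging to one filesystem instance must pass the same holder cookie, which is why bdev_holder has to be valid before any seed device is opened.

        struct block_device *bdev;

        bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        /* ... the matching release must pass the same mode ... */
        blkdev_put(bdev, FMODE_READ | FMODE_EXCL);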
Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 09ce951666ea..29eecbb6ec3a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -939,6 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { @@ -946,7 +947,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } -- cgit v1.2.3-58-ga151 From 20bcd64934e4eb8f3f90a0dca54fb0ac2edd7795 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 20 Oct 2011 00:06:20 +0300 Subject: Btrfs: close all bdevs on mount failure Fix a bug introduced by 20b45077. We have to return EINVAL on mount failure, but doing that too early in the sequence leaves all of the devices opened exclusively. This also fixes an issue where under some scenarios only a second mount -o degraded command would succeed. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e138af710de2..c6938b45e0fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -597,10 +597,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; + if (!bh) goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -655,7 +653,7 @@ error: continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; -- cgit v1.2.3-58-ga151 From 09e0834fb0ce1ea2a63885177015bd5d7d2bc22d Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Thu, 20 Oct 2011 18:56:10 -0400 Subject: ext4: fix deadlock in ext4_ordered_write_end() If ext4_jbd2_file_inode() in ext4_ordered_write_end() fails for some reasons, this function returns to caller without unlocking the page. It leads to the deadlock, and the patch fixes this issue. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 081bb25a9ad3..ecb572591924 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -959,7 +959,11 @@ static int ext4_ordered_write_end(struct file *file, ext4_orphan_add(handle, inode); if (ret2 < 0) ret = ret2; + } else { + unlock_page(page); + page_cache_release(page); } + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; -- cgit v1.2.3-58-ga151 From 8de49e674a1133ab8998914a6e933ceb4b5f4b07 Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Thu, 20 Oct 2011 19:23:08 -0400 Subject: ext4: fix the deadlock in mpage_da_map_and_submit() If ext4_jbd2_file_inode() in mpage_da_map_and_submit() fails due to journal abort, this function returns to caller without unlocking the page. It leads to the deadlock, and the patch fixes this issue by calling mpage_da_submit_io(). 
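The contract being repaired, in schematic form (not the ext4 code): ->write_end() receives the page locked and referenced and must drop both on every path, success and failure alike, or the next task to lock that page waits forever.

        static int example_write_end(struct page *page)
        {
                int ret = commit_the_data(page);        /* placeholder fs work */

                unlock_page(page);              /* never return with it held */
                page_cache_release(page);       /* drop the page reference */
                return ret;
        }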
Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ecb572591924..ff6aace0bb3c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1552,9 +1552,11 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) + if (err) { /* Only if the journal is aborted */ - return; + mpd->retval = err; + goto submit_io; + } } } @@ -2294,11 +2296,12 @@ retry: ret = 0; } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* - * got one extent now try with - * rest of the pages + * Got one extent now try with rest of the pages. + * If mpd.retval is set -EIO, journal is aborted. + * So we don't need to write any more. */ pages_written += mpd.pages_written; - ret = 0; + ret = mpd.retval; io_done = 1; } else if (wbc->nr_to_write) /* -- cgit v1.2.3-58-ga151 From 940aab490215424a269f93d2eba2794fc8b3e269 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Tue, 20 Sep 2011 17:27:14 -0700 Subject: Check validity of cl_rpcclient in nfs_server_list_show As soon as the nfs_client gets created, its cl_rpcclient is set to ERR_PTR(-EINVAL). The rpc client structure is allocated later. Check if the client is ready before using the cl_rpcclient pointer. Signed-off-by: Malahal Naineni Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b4e41dd4d0f6..873bf00d51a2 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1868,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v) /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), -- cgit v1.2.3-58-ga151 From 45dc63e7d8412546567399b94caeb683af58843e Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 20 Oct 2011 20:07:23 -0400 Subject: ext4: Allow quota file use root reservation The quota file is filesystem metadata, so it is reasonable to permit it to use the root reservation if necessary. This patch fixes the 265th xfstest failure. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8c005c028203..cd70b3041185 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4291,6 +4291,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + /* * For delayed allocation, we could skip the ENOSPC and * EDQUOT check, as blocks and quotas have been already -- cgit v1.2.3-58-ga151 From 4c28d33803d4aeaff32b4ac502af11a9b2aed8f4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 Jul 2011 09:17:28 +0100 Subject: GFS2: Clean up dir hash table reading Since there is now only a single caller to gfs2_dir_read_data() and it has a number of constant arguments, we can factor those out. Also some tests relating to the inode size were being done twice.
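One detail of the reader worth spelling out (fragment; after this patch the starting offset is always zero): do_div() divides a 64-bit value in place and returns the remainder, so one call splits a byte offset into a block number and an in-block offset.

        u64 lblock = offset;    /* input: byte offset, output: block number */
        unsigned int o;

        o = do_div(lblock, sdp->sd_jbsize);     /* o = offset % sd_jbsize */
        o += sizeof(struct gfs2_meta_header);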
Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1cc2f8ec52a2..2045d70753f1 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -240,16 +240,15 @@ fail: return error; } -static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, - u64 offset, unsigned int size) +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct buffer_head *dibh; int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - offset += sizeof(struct gfs2_dinode); - memcpy(buf, dibh->b_data + offset, size); + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); } @@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, * gfs2_dir_read_data - Read a data from a directory inode * @ip: The GFS2 Inode * @buf: The buffer to place result into - * @offset: File offset to begin jdata_readng from * @size: Amount of data to transfer * * Returns: The amount of data actually copied or the error */ -static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, - unsigned int size, unsigned ra) +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 lblock, dblock; @@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, unsigned int o; int copied = 0; int error = 0; - u64 disksize = i_size_read(&ip->i_inode); - - if (offset >= disksize) - return 0; - - if (offset + size > disksize) - size = disksize - offset; - - if (!size) - return 0; if (gfs2_is_stuffed(ip)) - return gfs2_dir_read_stuffed(ip, buf, offset, size); + return gfs2_dir_read_stuffed(ip, buf, size); if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) return -EINVAL; - lblock = offset; + lblock = 0; o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); while (copied < size) { @@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, if (error || !dblock) goto fail; BUG_ON(extlen < 1); - if (!ra) - extlen = 1; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); @@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, extlen--; memcpy(buf, bh->b_data + o, amount); brelse(bh); - buf += amount; + buf += (amount/sizeof(__be64)); copied += amount; lblock++; o = sizeof(struct gfs2_meta_header); @@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc == NULL) return ERR_PTR(-ENOMEM); - ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); + ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { kfree(hc); return ERR_PTR(ret); -- cgit v1.2.3-58-ga151 From 2f0264d592e34cde99efbad7d616b04af138e913 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 27 Jul 2011 10:58:48 +0100 Subject: GFS2: Split data write & wait in fsync Now that the data writing is part of fsync proper, we can split the waiting part out and do it later on. This reduces the number of waits that we do during fsync on average. There is also no need to take the i_mutex unless we are flushing metadata to disk, so we can move that to within the metadata flushing code. 
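Reduced to its essentials, the new fast path looks like this (sketch; sync_the_metadata() stands in for the filesystem-specific part): start the data I/O, do the metadata work while it is in flight, then wait for the data.

        int ret = 0, ret1 = 0;

        if (mapping->nrpages)
                ret1 = filemap_fdatawrite_range(mapping, start, end);

        if (ret1 != -EIO) {
                sync_the_metadata(inode);       /* overlaps the data I/O */
                if (mapping->nrpages)
                        ret = filemap_fdatawait_range(mapping, start, end);
        }
        return ret ? ret : ret1;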
Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e802903..92c3db424b40 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -551,8 +551,16 @@ static int gfs2_close(struct inode *inode, struct file *file) * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * - * The VFS will flush data for us. We only need to worry - * about metadata here. + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. * * Returns: errno */ @@ -560,30 +568,36 @@ static int gfs2_close(struct inode *inode, struct file *file) static int gfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); struct gfs2_inode *ip = GFS2_I(inode); - int ret; + int ret, ret1 = 0; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { + mutex_lock(&inode->i_mutex); ret = sync_inode_metadata(inode, 1); if (ret) { mutex_unlock(&inode->i_mutex); return ret; } gfs2_ail_flush(ip->i_gl); + mutex_unlock(&inode->i_mutex); } - mutex_unlock(&inode->i_mutex); - return 0; + if (mapping->nrpages) + ret = filemap_fdatawait_range(mapping, start, end); + + return ret ? ret : ret1; } /** -- cgit v1.2.3-58-ga151 From 75549186edf1515062fe2fcbfbd92bd99659afba Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 2 Aug 2011 13:09:36 +0100 Subject: GFS2: Fix bug-trap in ail flush code The assert was being tested under the wrong lock, a legacy of the original code. Also, if it does trigger, the resulting information was not always a lot of help. This moves the test under the correct lock and also prints out more useful information to help in tracking down the source of the problem.
Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index da21ecaafcc2..99df4832f94c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,17 @@ #include "trans.h" #include "dir.h" +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_page->mapping, bh->b_page->flags); + fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); +} + /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock @@ -41,20 +52,24 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) struct list_head *head = &gl->gl_ail_list; struct gfs2_bufdata *bd; struct buffer_head *bh; + sector_t blocknr; spin_lock(&sdp->sd_ail_lock); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_ail_gl_list); bh = bd->bd_bh; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; + blocknr = bh->b_blocknr; + if (buffer_busy(bh)) + gfs2_ail_error(gl, bh); bh->b_private = NULL; + gfs2_remove_from_ail(bd); /* drops ref on bh */ spin_unlock(&sdp->sd_ail_lock); - bd->bd_blkno = bh->b_blocknr; + bd->bd_bh = NULL; + bd->bd_blkno = blocknr; + gfs2_log_lock(sdp); - gfs2_assert_withdraw(sdp, !buffer_busy(bh)); gfs2_trans_add_revoke(sdp, bd); gfs2_log_unlock(sdp); -- cgit v1.2.3-58-ga151 From 1d4ec642d9f00d4c531b1a4ae0613091ec1f8e9b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 2 Aug 2011 13:13:20 +0100 Subject: GFS2: Make atime checks more efficient We do not need to start a transaction unless the atime check has proved positive. Also if we are going to flush the complete ail list anyway, we might as well skip the writeback for this specific inode's metadata, since that will be done as part of the ail writeback process in an order offering potentially more efficient I/O. 
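Reduced to the pattern (sketch): do the cheap comparison first, and pay for a transaction only when the timestamp has actually moved forward.

        if (timespec_compare(&inode->i_atime, &atime) > 0) {
                ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
                if (ret == 0) {
                        /* journal the buffer and write out the new atime */
                        gfs2_trans_end(sdp);
                }
        }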
Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b7beadd9ba4c..afb87615c014 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -768,30 +768,30 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) goto do_flush; unlock_required = 1; } - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { di = (struct gfs2_dinode *)bh->b_data; atime.tv_sec = be64_to_cpu(di->di_atime); atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret == 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + gfs2_trans_end(sdp); + } } brelse(bh); } - gfs2_trans_end(sdp); -do_unlock: if (unlock_required) gfs2_glock_dq_uninit(&gh); do_flush: if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - filemap_fdatawrite(metamapping); if (bdi->dirty_exceeded) gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) ret = filemap_fdatawait(metamapping); if (ret) -- cgit v1.2.3-58-ga151 From 40ac218f52aa5cac7dc8082f28b61c8b2b29373c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 2 Aug 2011 13:17:27 +0100 Subject: GFS2: Fix inode allocation error path If we have got far enough through the inode allocation code path that an inode has already been allocated, then we must call iput to dispose of it, if an error occurs during a later part of the process. This will always be the final iput since there will be no other references to the inode. Unlike when the inode has been unlinked, its block state will be GFS2_BLKST_INODE rather than GFS2_BLKST_UNLINKED so we need to skip the test in ->evict_inode() for this one case in order to ensure that it will be deallocated correctly. This patch adds a new flag in order to ensure that this will happen correctly. 
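The mechanism, sketched from both ends (fragments from the two paths involved): the failing allocation path tags the inode before dropping the last reference, and eviction treats the tag as permission to deallocate even though the block state is still GFS2_BLKST_INODE.

        /* allocation error path: */
        set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
        iput(inode);    /* final ref, so ->evict_inode() runs next */

        /* eviction path: skip the UNLINKED check for tagged inodes */
        if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))
                error = gfs2_check_blk_type(sdp, ip->i_no_addr,
                                            GFS2_BLKST_UNLINKED);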
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/inode.c | 6 ++++-- fs/gfs2/super.c | 12 +++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 892ac37de8ae..be5801a75406 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -267,6 +267,7 @@ struct gfs2_alloc { enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 900cf986aadc..044efe273b97 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -736,10 +736,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode && !IS_ERR(inode)) - iput(inode); fail_gunlock: gfs2_glock_dq_uninit(ghs); + if (inode && !IS_ERR(inode)) { + set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); + iput(inode); + } fail: if (bh) brelse(bh); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index afb87615c014..9961de702d1b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1471,9 +1471,11 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; + if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + } if (test_bit(GIF_INVALID, &ip->i_flags)) { error = gfs2_inode_refresh(ip); @@ -1513,6 +1515,10 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + gfs2_log_flush(sdp, ip->i_gl); + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl); + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) -- cgit v1.2.3-58-ga151 From f18185291d605ea9e442e00e2cf6c917a84d9837 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 5 Aug 2011 10:12:47 +0100 Subject: GFS2: Fix bug trap and journaled data fsync Journaled data requires that a complete flush of all dirty data for the file is done, in order that the ail flush which comes after will succeed. Also the recently enhanced bug trap can trigger falsely in case an ail flush from fsync races with a page read. This updates the bug trap such that it will ignore buffers which are locked and only trigger on dirty and/or pinned buffers when the ail flush is run from fsync. The original bug trap is retained when ail flush is run from ->go_sync() Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 ++ fs/gfs2/glops.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 92c3db424b40..9d12286d8111 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -590,6 +590,8 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, mutex_unlock(&inode->i_mutex); return ret; } + if (gfs2_is_jdata(ip)) + filemap_write_and_wait(mapping); gfs2_ail_flush(ip->i_gl); mutex_unlock(&inode->i_mutex); } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 99df4832f94c..0cc3ff48ce20 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -46,7 +46,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * None of the buffers should be dirty, locked, or pinned. 
*/ -static void __gfs2_ail_flush(struct gfs2_glock *gl) +static void __gfs2_ail_flush(struct gfs2_glock *gl, unsigned long b_state) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; @@ -60,7 +60,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) bd_ail_gl_list); bh = bd->bd_bh; blocknr = bh->b_blocknr; - if (buffer_busy(bh)) + if (bh->b_state & b_state) gfs2_ail_error(gl, bh); bh->b_private = NULL; gfs2_remove_from_ail(bd); /* drops ref on bh */ @@ -99,7 +99,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << BH_Pinned)|(1ul << BH_Lock)); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -117,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << BH_Pinned)); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -- cgit v1.2.3-58-ga151 From ab9bbda0204dfd0e5342562d9979d1241b14ea5f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 15 Aug 2011 14:20:36 +0100 Subject: GFS2: Use ->dirty_inode() The aim of this patch is to use the newly enhanced ->dirty_inode() super block operation to deal with atime updates, rather than piggybacking that code onto ->write_inode() as is currently done. The net result is a simplification of the code in various places and a reduction of the number of gfs2_dinode_out() calls, since this is now implied by ->dirty_inode(). Some of the mark_inode_dirty() calls have been moved under glocks, so that ->dirty_inode() can avoid taking locks when we already hold suitable ones. One consequence is that generic_write_end() now correctly deals with file size updates, so that we do not need a separate check for that afterwards. This also, indirectly, means that fdatasync should work correctly on GFS2 - the current code always syncs the metadata whether it needs to or not. Has survived testing with postmark (with and without atime) and also fsx.
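Editor's note: the interesting part of the new ->dirty_inode() (added in the diff below) is that it acquires only what the caller does not already hold, and records exactly which of the glock and the transaction it took so that it releases only those on exit. A minimal userspace model of that conditional-acquire pattern follows; the names, the plain ints standing in for lock state, and the 0x3 flag mask are illustrative stand-ins, not the kernel API.

#include <stdio.h>

static int glock_held, trans_open;	/* stand-ins for real lock state */

static void dirty_inode(int flags)
{
	int need_unlock = 0, need_endtrans = 0;

	/* Only sync-worthy dirtying implies a dinode update; the kernel
	 * tests I_DIRTY_SYNC|I_DIRTY_DATASYNC here. */
	if (!(flags & 0x3))
		return;

	if (!glock_held) {		/* gfs2_glock_nq_init(...) */
		glock_held = 1;
		need_unlock = 1;
	}
	if (!trans_open) {		/* gfs2_trans_begin(...) */
		trans_open = 1;
		need_endtrans = 1;
	}

	printf("write dinode under glock + transaction\n");

	/* Release only what this call acquired, in reverse order. */
	if (need_endtrans)
		trans_open = 0;		/* gfs2_trans_end(...) */
	if (need_unlock)
		glock_held = 0;		/* gfs2_glock_dq_uninit(...) */
}

int main(void)
{
	dirty_inode(0x1);	/* cold call: takes both, releases both */
	glock_held = 1;		/* simulate a caller already holding the glock */
	dirty_inode(0x1);	/* only opens and closes the transaction */
	glock_held = 0;
	return 0;
}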
Signed-off-by: Steven Whitehouse --- fs/gfs2/acl.c | 5 +-- fs/gfs2/aops.c | 6 ---- fs/gfs2/dir.c | 10 +----- fs/gfs2/file.c | 1 - fs/gfs2/inode.c | 43 ++++++++----------------- fs/gfs2/inode.h | 2 +- fs/gfs2/quota.c | 13 ++------ fs/gfs2/super.c | 97 ++++++++++++++++++++++++++++++++++++--------------------- fs/gfs2/xattr.c | 5 +-- 9 files changed, 85 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 34501b64bc47..65978d7885c8 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) iattr.ia_valid = ATTR_MODE; iattr.ia_mode = mode; - error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + error = gfs2_setattr_simple(inode, &iattr); } return error; @@ -160,6 +160,7 @@ out: int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) { + struct inode *inode = &ip->i_inode; struct posix_acl *acl; char *data; unsigned int len; @@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) - return gfs2_setattr_simple(ip, attr); + return gfs2_setattr_simple(inode, attr); error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); if (error) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f9fbbe96c222..212fe74927ba 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page, KM_USER0); @@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (copied) { if (inode->i_size < to) i_size_write(inode, to); - gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret > 0) { - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - } if (inode == sdp->sd_rindex) { adjust_fs_space(inode); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2045d70753f1..898e62ed5b85 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1681,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - int error; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1710,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } brelse(bh); - error = gfs2_meta_inode_buffer(dip, &bh); - if (error) - return error; - if (!dip->i_entries) gfs2_consist_inode(dip); - gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); - gfs2_dinode_out(dip, bh->b_data); - brelse(bh); mark_inode_dirty(&dip->i_inode); - return error; + return 0; } /** diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9d12286d8111..4416a1cfa1fe 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -802,7 +802,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, from = 0; } - gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); 
brelse(dibh); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 044efe273b97..a0b53d3bd8fa 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -729,8 +729,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); - gfs2_glock_dq_uninit_m(2, ghs); mark_inode_dirty(inode); + gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); return 0; @@ -926,8 +926,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_bh(ip->i_gl, dibh, 1); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(&ip->i_inode); + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); out_brelse: brelse(dibh); @@ -949,11 +950,6 @@ out_child: out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); - if (!error) { - ihold(inode); - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - } return error; } @@ -1026,8 +1022,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, clear_nlink(inode); else drop_nlink(inode); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); mark_inode_dirty(inode); if (inode->i_nlink == 0) gfs2_unlink_di(inode); @@ -1565,21 +1559,10 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - struct inode *inode = &ip->i_inode; - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - setattr_copy(inode, attr); mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); return 0; } @@ -1591,19 +1574,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) * Returns: errno */ -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { int error; if (current->journal_info) - return __gfs2_setattr_simple(ip, attr); + return __gfs2_setattr_simple(inode, attr); - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); if (error) return error; - error = __gfs2_setattr_simple(ip, attr); - gfs2_trans_end(GFS2_SB(&ip->i_inode)); + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); return error; } @@ -1641,7 +1624,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); if (error) goto out_end_trans; @@ -1697,12 +1680,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) error = gfs2_acl_chmod(ip, attr); else - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); out: - gfs2_glock_dq_uninit(&i_gh); if (!error) mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); return error; } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 8d90e0c07672..276e7b52b658 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); extern int gfs2_permission(struct inode *inode, int mask); -extern int 
gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 0e8bb13381e4..3a9a9749f496 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -638,7 +638,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; - struct buffer_head *bh, *dibh; + struct buffer_head *bh; struct page *page; void *kaddr, *ptr; struct gfs2_quota q, *qp; @@ -736,22 +736,13 @@ get_a_page: goto get_a_page; } - /* Update the disk inode timestamp and size (if extended) */ - err = gfs2_meta_inode_buffer(ip, &dibh); - if (err) - goto out; - size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); mark_inode_dirty(inode); - -out: return err; + unlock_out: unlock_page(page); page_cache_release(page); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9961de702d1b..b05fa5954550 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -752,53 +752,79 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = metamapping->backing_dev_info; - struct gfs2_holder gh; - struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = -EAGAIN; - int unlock_required = 0; - - /* Skip timestamp update, if this is from a memalloc */ - if (current->flags & PF_MEMALLOC) - goto do_flush; - if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - unlock_required = 1; - } - ret = gfs2_meta_inode_buffer(ip, &bh); - if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret == 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - gfs2_trans_end(sdp); - } - } - brelse(bh); - } - if (unlock_required) - gfs2_glock_dq_uninit(&gh); -do_flush: + int ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); if (bdi->dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); - if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) + if (wbc->sync_mode == WB_SYNC_ALL) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); return ret; } +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and transaction lock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. 
+ */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + return; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + return; + } + need_unlock = 1; + } + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + } + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) + gfs2_glock_dq_uninit(&gh); +} + /** * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one * @sdp: the filesystem @@ -1578,6 +1604,7 @@ const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, .evict_inode = gfs2_evict_inode, .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 439b61c03262..695304cf01cc 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,7 +1296,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; int error; @@ -1319,7 +1320,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); gfs2_trans_end(sdp); return error; } -- cgit v1.2.3-58-ga151 From 9a63edd12ba3c18351f00d6b77a6b2f49f2b8eb6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 18 Aug 2011 14:35:53 +0100 Subject: GFS2: Clean up gfs2_create If we pass through knowledge of whether the creation is intended to be exclusive or not, then we can deal with that in gfs2_create_inode and remove one set of locking. Also this removes the loop in gfs2_create and simplifies the code a bit. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a0b53d3bd8fa..2af69056a9fd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -663,7 +663,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int mode, dev_t dev, const char *symname, - unsigned int size) + unsigned int size, int excl) { const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; @@ -683,6 +683,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail; error = create_ok(dip, name, mode); + if ((error == -EEXIST) && S_ISREG(mode) && !excl) { + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + gfs2_glock_dq_uninit(ghs); + d_instantiate(dentry, inode); + return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; + } if (error) goto fail_gunlock; @@ -760,24 +766,10 @@ fail: static int gfs2_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - struct inode *inode; - int ret; - - for (;;) { - ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); - if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) - return ret; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode) { - if (!IS_ERR(inode)) - break; - return PTR_ERR(inode); - } - } - - d_instantiate(dentry, inode); - return 0; + int excl = 0; + if (nd && (nd->flags & LOOKUP_EXCL)) + excl = 1; + return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); } /** @@ -1135,7 +1127,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); } /** @@ -1149,7 +1141,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); } /** @@ -1164,7 +1156,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); } /* -- cgit v1.2.3-58-ga151 From 9453615a1a7ef3fa910c6464a619595556cfcd63 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 23 Aug 2011 10:19:25 +0100 Subject: GFS2: Fix lseek after SEEK_DATA, SEEK_HOLE have been added We need to take the inode's glock whenever the inode's size is referenced, otherwise it might not be uptodate. Even though generic_file_llseek_unlocked() doesn't implement SEEK_DATA, SEEK_HOLE directly, it does reference the inode's size in those cases, so we need to add them to the list of origins which need the glock. Signed-off-by: Steven Whitehouse Cc: Andi Kleen --- fs/gfs2/file.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4416a1cfa1fe..d717b72500a4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) struct gfs2_holder i_gh; loff_t error; - if (origin == 2) { + switch (origin) { + case SEEK_END: /* These reference inode->i_size */ + case SEEK_DATA: + case SEEK_HOLE: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } - } else + break; + case SEEK_CUR: + case SEEK_SET: error = generic_file_llseek_unlocked(file, offset, origin); + break; + default: + error = -EINVAL; + } return error; } -- cgit v1.2.3-58-ga151 From 7c9ca621137cde26be05448133fc1a554345f4f8 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 31 Aug 2011 09:53:19 +0100 Subject: GFS2: Use rbtree for resource groups and clean up bitmap buffer ref count scheme Here is an update of Bob's original rbtree patch which, in addition, also resolves the rather strange ref counting that was being done relating to the bitmap blocks. 
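Editor's note, before the details below: the structural core of the change is simple. Resource groups are indexed by their starting block address and looked up by the block range they cover, descending left when the target block is below a node's range and right when it is above. Here is a self-contained sketch of that interval lookup, using a plain unbalanced BST where the kernel uses its rbtree; all names are invented for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Each node owns a half-open block range [start, start + len);
 * ranges do not overlap and the tree is ordered by start. */
struct range_node {
	uint64_t start, len;
	struct range_node *left, *right;
};

static struct range_node *range_insert(struct range_node *root,
				       uint64_t start, uint64_t len)
{
	if (!root) {
		struct range_node *n = calloc(1, sizeof(*n));
		n->start = start;
		n->len = len;
		return n;
	}
	if (start < root->start)
		root->left = range_insert(root->left, start, len);
	else
		root->right = range_insert(root->right, start, len);
	return root;
}

static struct range_node *range_lookup(struct range_node *root, uint64_t blk)
{
	while (root) {
		if (blk < root->start)
			root = root->left;		/* below this range */
		else if (blk >= root->start + root->len)
			root = root->right;		/* above this range */
		else
			return root;	/* blk falls inside this range */
	}
	return NULL;
}

int main(void)
{
	struct range_node *root = NULL;
	root = range_insert(root, 100, 50);
	root = range_insert(root, 150, 50);
	root = range_insert(root, 200, 50);

	struct range_node *n = range_lookup(root, 170);
	if (n)
		printf("block 170 -> rgrp starting at %llu\n",
		       (unsigned long long)n->start);
	return 0;
}

In the patch itself the same walk is written against struct rb_node in gfs2_blk2rgrpd(), under the ->sd_rindex_spin lock.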
Originally we had a dual system for journaling resource groups. The metadata blocks were journaled and also the rgrp itself was added to a list. The reason for adding the rgrp to the list in the journal was so that the "repolish clones" code could be run to update the free space, and potentially send any discard requests when the log was flushed. This was done by comparing the "cloned" bitmap with what had been written back on disk during the transaction commit. Due to this, there was a requirement to hang on to the rgrps' bitmap buffers until the journal had been flushed. For that reason, there was a rather complicated setup in the ->go_lock / ->go_unlock functions for rgrps involving both a mutex and a spinlock (the ->sd_rindex_spin) to maintain a reference count on the buffers. However, the journal maintains a reference count on the buffers anyway, since they are being journaled as metadata buffers. So by moving the code which deals with the post-journal accounting for bitmap blocks to the metadata journaling code, we can entirely dispense with the rather strange buffer ref counting scheme and also the requirement to journal the rgrps. The net result of all this is that the ->sd_rindex_spin is left to do exactly one job, and that is to look after the rbtree of rgrps. This patch is designed to be a stepping stone towards using RCU for the rbtree of resource groups; however, the reduction in the number of uses of the ->sd_rindex_spin is likely to have benefits for multi-threaded workloads anyway. The patch retains ->go_lock and ->go_unlock for rgrps; however, these may also be removed in future in favour of calling the functions directly where required in the code. That will allow locking of resource groups without needing to actually read them in - something that could be useful in speeding up statfs. In the meantime, though, it is valid to dereference ->bi_bh only when the rgrp is locked. This is basically the same rule as before, modulo the references not being valid until the following journal flush. Signed-off-by: Steven Whitehouse Signed-off-by: Bob Peterson Cc: Benjamin Marzinski --- fs/gfs2/glops.c | 42 ++----- fs/gfs2/incore.h | 11 +- fs/gfs2/lops.c | 64 ++++------ fs/gfs2/ops_fstype.c | 3 +- fs/gfs2/rgrp.c | 344 ++++++++++++++------------------------------- fs/gfs2/rgrp.h | 10 +- fs/gfs2/trans.c | 5 - fs/gfs2/trans.h | 14 +-- 8 files changed, 148 insertions(+), 345 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0cc3ff48ce20..6f82aac9b0ee 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -134,6 +134,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int x; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -145,6 +147,15 @@ static void rgrp_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + + if (!rgd) + return; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } } /** @@ -444,33 +455,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) return 0; } -/** - * rgrp_go_lock - operation done after an rgrp lock is locked by - * a first holder on this node.
- * @gl: the glock - * @flags: - * - * Returns: errno - */ - -static int rgrp_go_lock(struct gfs2_holder *gh) -{ - return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); -} - -/** - * rgrp_go_unlock - operation done before an rgrp lock is unlocked by - * a last holder on this node. - * @gl: the glock - * @flags: - * - */ - -static void rgrp_go_unlock(struct gfs2_holder *gh) -{ - gfs2_rgrp_bh_put(gh->gh_gl->gl_object); -} - /** * trans_go_sync - promote/demote the transaction glock * @gl: the glock @@ -573,8 +557,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_lock = rgrp_go_lock, - .go_unlock = rgrp_go_unlock, + .go_lock = gfs2_rgrp_go_lock, + .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_ASPACE, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index be5801a75406..9e753fc9e6a5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -18,6 +18,7 @@ #include #include #include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -78,8 +79,7 @@ struct gfs2_bitmap { }; struct gfs2_rgrpd { - struct list_head rd_list; /* Link with superblock */ - struct list_head rd_list_mru; + struct rb_node rd_node; /* Link with superblock */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ @@ -91,10 +91,7 @@ struct gfs2_rgrpd { u32 rd_dinodes; u64 rd_igeneration; struct gfs2_bitmap *rd_bits; - struct mutex rd_mutex; - struct gfs2_log_element rd_le; struct gfs2_sbd *rd_sbd; - unsigned int rd_bh_count; u32 rd_last_alloc; u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ @@ -575,9 +572,7 @@ struct gfs2_sbd { int sd_rindex_uptodate; spinlock_t sd_rindex_spin; struct mutex sd_rindex_mutex; - struct list_head sd_rindex_list; - struct list_head sd_rindex_mru_list; - struct gfs2_rgrpd *sd_rindex_forward; + struct rb_root sd_rindex_tree; unsigned int sd_rgrps; unsigned int sd_max_rg_data; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 05bbb124699f..de05b4d66ef3 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) trace_gfs2_pin(bd, 1); } +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + if (bi->bi_clone == 0) + return; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; +} + /** * gfs2_unpin - Unpin a buffer * @sdp: the filesystem the buffer belongs to @@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, mark_buffer_dirty(bh); clear_buffer_pinned(bh); + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { list_del(&bd->bd_ail_st_list); @@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_revoke_clean(sdp); } -static void 
rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_rgrpd *rgd; - struct gfs2_trans *tr = current->journal_info; - - tr->tr_touched = 1; - - rgd = container_of(le, struct gfs2_rgrpd, rd_le); - - gfs2_log_lock(sdp); - if (!list_empty(&le->le_list)){ - gfs2_log_unlock(sdp); - return; - } - gfs2_rgrp_bh_hold(rgd); - sdp->sd_log_num_rg++; - list_add(&le->le_list, &sdp->sd_log_le_rg); - gfs2_log_unlock(sdp); -} - -static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -{ - struct list_head *head = &sdp->sd_log_le_rg; - struct gfs2_rgrpd *rgd; - - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); - list_del_init(&rgd->rd_le.le_list); - sdp->sd_log_num_rg--; - - gfs2_rgrp_repolish_clones(rgd); - gfs2_rgrp_bh_put(rgd); - } - gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); -} - /** * databuf_lo_add - Add a databuf to the transaction. * @@ -771,8 +761,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { }; const struct gfs2_log_operations gfs2_rg_lops = { - .lo_add = rg_lo_add, - .lo_after_commit = rg_lo_after_commit, .lo_name = "rg", }; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 079587e53849..69166ff7f37d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -77,8 +77,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_rindex_spin); mutex_init(&sdp->sd_rindex_mutex); - INIT_LIST_HEAD(&sdp->sd_rindex_list); - INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + sdp->sd_rindex_tree.rb_node = NULL; INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7f8af1eb02de..00f6e3d62c22 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -328,15 +329,25 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) { - struct gfs2_rgrpd *rgd; + struct rb_node **newn, *parent = NULL; spin_lock(&sdp->sd_rindex_spin); - list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { - if (rgrp_contains_block(rgd, blk)) { - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + newn = &sdp->sd_rindex_tree.rb_node; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (blk < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (blk > cur->rd_data0 + cur->rd_data) + newn = &((*newn)->rb_right); + else { spin_unlock(&sdp->sd_rindex_spin); - return rgd; + return cur; } } @@ -354,8 +365,13 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) { - gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); - return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + + return rgd; } /** @@ -367,28 +383,34 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) { - if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == 
n)) { + spin_unlock(&sdp->sd_rindex_spin); return NULL; - return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; } static void clear_rgrpdi(struct gfs2_sbd *sdp) { - struct list_head *head; + struct rb_node *n; struct gfs2_rgrpd *rgd; struct gfs2_glock *gl; - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = NULL; - spin_unlock(&sdp->sd_rindex_spin); - - head = &sdp->sd_rindex_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); gl = rgd->rd_gl; - list_del(&rgd->rd_list); - list_del(&rgd->rd_list_mru); + rb_erase(n, &sdp->sd_rindex_tree); if (gl) { gl->gl_object = NULL; @@ -535,6 +557,29 @@ static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); } +static void rgd_insert(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return; + } + + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); +} + /** * read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode @@ -566,14 +611,11 @@ static int read_rindex_entry(struct gfs2_inode *ip, if (!rgd) return error; - mutex_init(&rgd->rd_mutex); - lops_init_le(&rgd->rd_le, &gfs2_rg_lops); rgd->rd_sbd = sdp; - list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); - list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - gfs2_rindex_in(rgd, buf); + rgd_insert(rgd); + error = compute_bitstructs(rgd); if (error) return error; @@ -585,6 +627,8 @@ static int read_rindex_entry(struct gfs2_inode *ip, rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; return error; } @@ -601,8 +645,6 @@ int gfs2_ri_update(struct gfs2_inode *ip) struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; u64 rgrp_count = i_size_read(inode); - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; int error; do_div(rgrp_count, sizeof(struct gfs2_rindex)); @@ -617,10 +659,6 @@ int gfs2_ri_update(struct gfs2_inode *ip) } } - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; } @@ -694,7 +732,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) } /** - * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. 
@@ -703,8 +741,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) { + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; unsigned int length = rgd->rd_length; @@ -712,17 +751,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) unsigned int x, y; int error; - mutex_lock(&rgd->rd_mutex); - - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_bh_count) { - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - mutex_unlock(&rgd->rd_mutex); - return 0; - } - spin_unlock(&sdp->sd_rindex_spin); - for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); @@ -747,15 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_free_clone = rgd->rd_free; } - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - - mutex_unlock(&rgd->rd_mutex); - return 0; fail: @@ -765,52 +787,32 @@ fail: bi->bi_bh = NULL; gfs2_assert_warn(sdp, !bi->bi_clone); } - mutex_unlock(&rgd->rd_mutex); return error; } -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); -} - /** - * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() * @rgd: the struct gfs2_rgrpd describing the RG to read in * */ -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - if (--rgd->rd_bh_count) { - spin_unlock(&sdp->sd_rindex_spin); - return; - } - for (x = 0; x < length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; brelse(bi->bi_bh); bi->bi_bh = NULL; } - spin_unlock(&sdp->sd_rindex_spin); } -static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - const struct gfs2_bitmap *bi) +void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi) { struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; @@ -823,7 +825,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, unsigned int x; for (x = 0; x < bi->bi_len; x++) { - const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *orig = bh->b_data + bi->bi_offset + x; const u8 *clone = bi->bi_clone + bi->bi_offset + x; u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); diff &= 0x55; @@ -862,28 +864,6 @@ fail: sdp->sd_args.ar_discard = 0; } -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int length = rgd->rd_length; - unsigned int x; - - for (x = 0; x < length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - if (!bi->bi_clone) - continue; - if (sdp->sd_args.ar_discard) - gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); - clear_bit(GBF_FULL, &bi->bi_flags); 
- memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); - } - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - spin_unlock(&sdp->sd_rindex_spin); -} - /** * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode * @ip: the incore GFS2 inode structure @@ -911,20 +891,15 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) { - struct gfs2_sbd *sdp = rgd->rd_sbd; - int ret = 0; - if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - spin_lock(&sdp->sd_rindex_spin); if (rgd->rd_free_clone >= al->al_requested) { al->al_rgd = rgd; - ret = 1; + return 1; } - spin_unlock(&sdp->sd_rindex_spin); - return ret; + return 0; } /** @@ -991,76 +966,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip return; } -/** - * recent_rgrp_next - get next RG from "recent" list - * @cur_rgd: current rgrp - * - * Returns: The next rgrp in the recent list - */ - -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) -{ - struct gfs2_sbd *sdp = cur_rgd->rd_sbd; - struct list_head *head; - struct gfs2_rgrpd *rgd; - - spin_lock(&sdp->sd_rindex_spin); - head = &sdp->sd_rindex_mru_list; - if (unlikely(cur_rgd->rd_list_mru.next == head)) { - spin_unlock(&sdp->sd_rindex_spin); - return NULL; - } - rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); - spin_unlock(&sdp->sd_rindex_spin); - return rgd; -} - -/** - * forward_rgrp_get - get an rgrp to try next from full list - * @sdp: The GFS2 superblock - * - * Returns: The rgrp to try next - */ - -static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) -{ - struct gfs2_rgrpd *rgd; - unsigned int journals = gfs2_jindex_size(sdp); - unsigned int rg = 0, x; - - spin_lock(&sdp->sd_rindex_spin); - - rgd = sdp->sd_rindex_forward; - if (!rgd) { - if (sdp->sd_rgrps >= journals) - rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; - - for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; - x++, rgd = gfs2_rgrpd_get_next(rgd)) - /* Do Nothing */; - - sdp->sd_rindex_forward = rgd; - } - - spin_unlock(&sdp->sd_rindex_spin); - - return rgd; -} - -/** - * forward_rgrp_set - set the forward rgrp pointer - * @sdp: the filesystem - * @rgd: The new forward rgrp - * - */ - -static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) -{ - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = rgd; - spin_unlock(&sdp->sd_rindex_spin); -} - /** * get_local_rgrp - Choose and lock a rgrp for allocation * @ip: the inode to reserve space for @@ -1076,14 +981,15 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = ip->i_alloc; - int flags = LM_FLAG_TRY; - int skipped = 0; - int loops = 0; int error, rg_locked; + int loops = 0; - rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); - while (rgd) { + if (rgd == NULL) + return -EBADSLT; + + while (loops < 3) { rg_locked = 0; if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { @@ -1096,80 +1002,24 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) switch (error) { case 0: if (try_rgrp_fit(rgd, al)) - goto out; + return 0; if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); /* fall through */ case GLR_TRYFAILED: - rgd = 
recent_rgrp_next(rgd); - break; - - default: - return error; - } - } - - /* Go through full list of rgrps */ - - begin = rgd = forward_rgrp_get(sdp); - - for (;;) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &al->al_rgd_gh); - } - switch (error) { - case 0: - if (try_rgrp_fit(rgd, al)) - goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); - if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - break; - - case GLR_TRYFAILED: - skipped++; + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == begin) + loops++; break; default: return error; } - - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - - if (rgd == begin) { - if (++loops >= 3) - return -ENOSPC; - if (!skipped) - loops++; - flags = 0; - if (loops == 2) - gfs2_log_flush(sdp, NULL); - } - } - -out: - if (begin) { - spin_lock(&sdp->sd_rindex_spin); - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - spin_unlock(&sdp->sd_rindex_spin); - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - forward_rgrp_set(sdp, rgd); } - return 0; + return -ENOSPC; } /** @@ -1352,6 +1202,7 @@ do_search: /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bi->bi_bh)); if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; @@ -1371,6 +1222,7 @@ skip: if (blk == BFITNOENT) return blk; + *n = 1; if (old_state == new_state) goto out; @@ -1539,9 +1391,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) gfs2_statfs_change(sdp, 0, -(s64)*n, 0); gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); - spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone -= *n; - spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); *bn = block; return 0; @@ -1594,9 +1444,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) gfs2_statfs_change(sdp, 0, -1, +1); gfs2_trans_add_unrevoke(sdp, block, 1); - spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); *bn = block; return 0; @@ -1629,8 +1477,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); - /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) gfs2_meta_wipe(ip, bstart, blen); @@ -1666,7 +1512,6 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); } static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) @@ -1688,7 +1533,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, +1, -1); - gfs2_trans_add_rg(rgd); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index d253f9a8c70e..59b950f43ccf 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -25,11 +25,8 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct 
gfs2_sbd *sdp); extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); -extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); - -extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) @@ -72,5 +69,8 @@ extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9ec73a854111..86ac75d99d31 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) gfs2_log_unlock(sdp); } -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) -{ - lops_add(rgd->rd_sbd, &rgd->rd_le); -} - diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index fb56b783e028..980c5c05398a 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -34,14 +34,12 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) al->al_requested + 1 : al->al_rgd->rd_length; } -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); -void gfs2_trans_end(struct gfs2_sbd *sdp); - -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); -void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); #endif /* __TRANS_DOT_H__ */ -- cgit v1.2.3-58-ga151 From 8339ee543ece6e2dcc1bbd97d5350163c198cf00 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Aug 2011 16:38:29 +0100 Subject: GFS2: Make resource groups "append only" during life of fs Since we have ruled out supporting online filesystem shrink, it is possible to make the resource group list append only during the life of a super block. This gives several benefits: Firstly, we only need to read new rindex elements as they are added rather than needing to reread the whole rindex file each time one element is added. Secondly, the rindex glock can be held for much shorter periods of time, and is completely removed from the fast path for allocations. The lock is taken in shared mode only when updating the resource groups when the first allocation occurs, and after a grow has taken place. Thirdly, this results in a reduction in code size, and everything gets a lot simpler to understand in this area. 
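Editor's note: the refresh logic this enables (visible in gfs2_rindex_update() in the diff below) is a classic check-lock-recheck pattern combined with an append-only read loop, with EOF signalled by a positive return as the updated read_rindex_entry() comment notes. A minimal userspace model follows, assuming pthreads; the names are illustrative, and in the kernel the flag test outside the mutex additionally relies on appropriate memory ordering.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rindex_mutex = PTHREAD_MUTEX_INITIALIZER;
static int rindex_uptodate;
static int nr_entries;		/* entries read into memory so far */
static int total_on_disk = 4;	/* entries currently in the rindex file */

static int read_one_entry(void)
{
	/* Stand-in for read_rindex_entry(): 0 = got one, 1 = EOF. */
	if (nr_entries >= total_on_disk)
		return 1;
	nr_entries++;
	return 0;
}

static int rindex_update(void)
{
	int err = 0;

	if (rindex_uptodate)
		return 0;	/* fast path: no locking at all */

	pthread_mutex_lock(&rindex_mutex);
	if (!rindex_uptodate) {	/* re-check under the lock */
		/* Append only: entries already read are kept, and the
		 * loop picks up from where the last reader stopped. */
		while ((err = read_one_entry()) == 0)
			;
		if (err > 0) {	/* EOF is success */
			err = 0;
			rindex_uptodate = 1;
		}
	}
	pthread_mutex_unlock(&rindex_mutex);
	return err;
}

int main(void)
{
	rindex_update();
	printf("entries after mount-time update: %d\n", nr_entries);

	total_on_disk = 6;	/* simulate gfs2_grow adding two rgrps */
	rindex_uptodate = 0;	/* ...and the corresponding invalidation */
	rindex_update();
	printf("entries after refresh: %d\n", nr_entries);
	return 0;
}

The payoff described above is that concurrent callers do the expensive re-read at most once per invalidation, and the common allocation path never touches the rindex lock at all.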
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 7 --- fs/gfs2/dir.c | 6 -- fs/gfs2/glops.c | 16 ++---- fs/gfs2/incore.h | 1 - fs/gfs2/inode.c | 15 +---- fs/gfs2/rgrp.c | 168 ++++++++++++++++++++++++------------------------------- fs/gfs2/rgrp.h | 17 +++--- fs/gfs2/super.c | 22 ++------ fs/gfs2/xattr.c | 17 +----- 9 files changed, 95 insertions(+), 174 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7878c473ae62..e60296137707 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -783,11 +783,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else if (ip->i_depth) revokes = sdp->sd_inptrs; - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); - else if (!sdp->sd_rgrps) - error = gfs2_ri_update(ip); - if (error) return error; @@ -887,8 +882,6 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - if (ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 898e62ed5b85..90b877b464ca 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1807,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_put; - error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); - if (error) - goto out_qs; - /* Count the number of leaves */ bh = leaf_bh; @@ -1889,8 +1885,6 @@ out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: gfs2_rlist_free(&rlist); - gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); -out_qs: gfs2_quota_unhold(dip); out_put: gfs2_alloc_put(dip); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 6f82aac9b0ee..951541b6234c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -134,8 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { struct address_space *metamapping = gfs2_glock2aspace(gl); - struct gfs2_rgrpd *rgd = gl->gl_object; - unsigned int x; + struct gfs2_rgrpd *rgd; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -148,14 +147,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl) mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); - if (!rgd) - return; - - for (x = 0; x < rgd->rd_length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; - } + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_free_clones(rgd); + spin_unlock(&gl->gl_spin); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 9e753fc9e6a5..56847f5903ae 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -255,7 +255,6 @@ struct gfs2_alloc { unsigned int al_line; char *al_file; - struct gfs2_holder al_ri_gh; struct gfs2_holder al_rgd_gh; struct gfs2_rgrpd *al_rgd; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2af69056a9fd..29703dd97dc9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1041,13 +1041,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; int error; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); @@ -1104,7 +1099,6 @@ out_child: gfs2_glock_dq(ghs); out_parent: gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -1222,7 +1216,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = 
GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -1236,10 +1230,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, &r_gh); @@ -1376,7 +1366,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, al->al_requested = sdp->sd_max_dirres; - error = gfs2_inplace_reserve_ri(ndip); + error = gfs2_inplace_reserve(ndip); if (error) goto out_gunlock_q; @@ -1447,7 +1437,6 @@ out_gunlock_r: if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: - gfs2_glock_dq_uninit(&ri_gh); return error; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 00f6e3d62c22..88d5b75067a8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -332,14 +332,10 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct rb_node **newn, *parent = NULL; spin_lock(&sdp->sd_rindex_spin); - newn = &sdp->sd_rindex_tree.rb_node; - - /* Figure out where to put new node */ while (*newn) { struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); - parent = *newn; if (blk < cur->rd_addr) newn = &((*newn)->rb_left); @@ -350,7 +346,6 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) return cur; } } - spin_unlock(&sdp->sd_rindex_spin); return NULL; @@ -368,8 +363,10 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) const struct rb_node *n; struct gfs2_rgrpd *rgd; + spin_lock(&sdp->sd_rindex_spin); n = rb_first(&sdp->sd_rindex_tree); rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); return rgd; } @@ -400,7 +397,18 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) return rgd; } -static void clear_rgrpdi(struct gfs2_sbd *sdp) +void gfs2_free_clones(struct gfs2_rgrpd *rgd) +{ + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) { struct rb_node *n; struct gfs2_rgrpd *rgd; @@ -413,23 +421,19 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { + spin_lock(&gl->gl_spin); gl->gl_object = NULL; + spin_unlock(&gl->gl_spin); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } + gfs2_free_clones(rgd); kfree(rgd->rd_bits); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_rindex_mutex); - clear_rgrpdi(sdp); - mutex_unlock(&sdp->sd_rindex_mutex); -} - static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); @@ -546,17 +550,6 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) return total_data; } -static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) -{ - const struct gfs2_rindex *str = buf; - - rgd->rd_addr = be64_to_cpu(str->ri_addr); - rgd->rd_length = be32_to_cpu(str->ri_length); - rgd->rd_data0 = be64_to_cpu(str->ri_data0); - rgd->rd_data = be32_to_cpu(str->ri_data); - rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); -} - static void rgd_insert(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; @@ -584,7 +577,7 @@ static void rgd_insert(struct gfs2_rgrpd *rgd) * 
read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode * - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, > 0 on EOF, error code otherwise */ static int read_rindex_entry(struct gfs2_inode *ip, @@ -592,19 +585,18 @@ static int read_rindex_entry(struct gfs2_inode *ip, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); - char buf[sizeof(struct gfs2_rindex)]; + struct gfs2_rindex buf; int error; struct gfs2_rgrpd *rgd; - error = gfs2_internal_read(ip, ra_state, buf, &pos, + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos, sizeof(struct gfs2_rindex)); - if (!error) - return 0; - if (error != sizeof(struct gfs2_rindex)) { - if (error > 0) - error = -EIO; - return error; - } + + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); error = -ENOMEM; @@ -612,23 +604,34 @@ static int read_rindex_entry(struct gfs2_inode *ip, return error; rgd->rd_sbd = sdp; - - gfs2_rindex_in(rgd, buf); - rgd_insert(rgd); + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); error = compute_bitstructs(rgd); if (error) - return error; + goto fail; error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) - return error; + goto fail; rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + rgd_insert(rgd); + sdp->sd_rgrps++; + spin_unlock(&sdp->sd_rindex_spin); + return error; + +fail: + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -639,34 +642,28 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -int gfs2_ri_update(struct gfs2_inode *ip) +static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = i_size_read(inode); int error; - do_div(rgrp_count, sizeof(struct gfs2_rindex)); - clear_rgrpdi(sdp); - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { + do { error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } + } while (error == 0); + + if (error < 0) + return error; sdp->sd_rindex_uptodate = 1; return 0; } /** - * gfs2_rindex_hold - Grab a lock on the rindex + * gfs2_rindex_update - Update the rindex if required * @sdp: The GFS2 superblock - * @ri_gh: the glock holder * * We grab a lock on the rindex inode to make sure that it doesn't * change whilst we are performing an operation. We keep this lock @@ -678,30 +675,29 @@ int gfs2_ri_update(struct gfs2_inode *ip) * special file, which might have been updated if someone expanded the * filesystem (via gfs2_grow utility), which adds new resource groups. 
* - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, error code otherwise */ -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +int gfs2_rindex_update(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); struct gfs2_glock *gl = ip->i_gl; - int error; - - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); - if (error) - return error; + struct gfs2_holder ri_gh; + int error = 0; /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - if (!sdp->sd_rindex_uptodate) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); - if (error) - gfs2_glock_dq_uninit(ri_gh); - } + gfs2_glock_dq_uninit(&ri_gh); mutex_unlock(&sdp->sd_rindex_mutex); } + return error; } @@ -873,8 +869,13 @@ fail: struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; BUG_ON(ip->i_alloc != NULL); ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); + error = gfs2_rindex_update(sdp); + if (error) + fs_warn(sdp, "rindex update returns %d\n", error); return ip->i_alloc; } @@ -1029,7 +1030,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); @@ -1041,18 +1042,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; - if (hold_rindex) { - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read - in, so: */ - error = gfs2_ri_update(ip); - if (error) - return error; - } - try_again: do { error = get_local_rgrp(ip, &last_unlinked); @@ -1069,11 +1058,8 @@ try_again: } } while (error && tries++ < 3); - if (error) { - if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&al->al_ri_gh); + if (error) return error; - } /* no error, so we have the rgrp set in the inode's allocation. 
*/ al->al_file = file; @@ -1103,8 +1089,6 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_ri_gh); } /** @@ -1558,34 +1542,26 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh, rgd_gh; - struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); - int ri_locked = 0; + struct gfs2_holder rgd_gh; int error; - if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - ri_locked = 1; - } + error = gfs2_rindex_update(sdp); + if (error) + return error; error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr); if (!rgd) - goto fail_rindex; + goto fail; error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); if (error) - goto fail_rindex; + goto fail; if (gfs2_get_block_type(rgd, no_addr) != type) error = -ESTALE; gfs2_glock_dq_uninit(&rgd_gh); -fail_rindex: - if (ri_locked) - gfs2_glock_dq_uninit(&ri_gh); fail: return error; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 59b950f43ccf..0439fca18f08 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -18,13 +18,13 @@ struct gfs2_holder; extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); -struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); - +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); @@ -36,16 +36,13 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line); #define gfs2_inplace_reserve(ip) \ - gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) -#define gfs2_inplace_reserve_ri(ip) \ - gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) + gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b05fa5954550..f716c4f8b252 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1037,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) { - struct gfs2_holder ri_gh; struct gfs2_rgrpd *rgd_next; struct gfs2_holder *gha, *gh; unsigned int slots = 64; @@ -1050,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host if (!gha) return -ENOMEM; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - 
rgd_next = gfs2_rgrpd_get_first(sdp); for (;;) { @@ -1096,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host yield(); } - gfs2_glock_dq_uninit(&ri_gh); - -out: kfree(gha); return error; } @@ -1150,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) struct gfs2_statfs_change_host sc; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (gfs2_tune_get(sdp, gt_statfs_slow)) error = gfs2_statfs_slow(sdp, &sc); else @@ -1420,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) goto out; - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - if (error) - goto out_qs; - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); if (!rgd) { gfs2_consist_inode(ip); error = -EIO; - goto out_rindex_relse; + goto out_qs; } error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &al->al_rgd_gh); if (error) - goto out_rindex_relse; + goto out_qs; error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, sdp->sd_jdesc->jd_blocks); @@ -1449,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) out_rg_gunlock: gfs2_glock_dq_uninit(&al->al_rgd_gh); -out_rindex_relse: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_qs: gfs2_quota_unhold(ip); out: diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 695304cf01cc..167f4af53b1d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); - gfs2_glock_dq_uninit(&al->al_ri_gh); - -out_quota: gfs2_quota_unhold(ip); out_alloc: gfs2_alloc_put(ip); @@ -1502,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) - goto out_rindex; + goto out_quota; if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) - goto out_rindex; + goto out_quota; } error = ea_dealloc_block(ip); -out_rindex: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_quota: gfs2_quota_unhold(ip); out_alloc: -- cgit v1.2.3-58-ga151 From 54335b1fca27b84baa75b1f45985d98262003837 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 1 Sep 2011 13:31:59 +0100 Subject: GFS2: Cache the most recently used resource group in the inode This means that after the initial allocation for any inode, the last used resource group is cached in the inode for future use. This drastically reduces the number of lookups of resource groups in the common case, and thus the contention on that data structure. The allocation algorithm is the same as previously, except that we always check to see if the goal block is within the cached rgrp first before going to the rbtree to look one up. 
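In outline, the fast path added by this patch looks as follows (an illustrative sketch only; rgrp_for_goal() is a made-up name, while ip->i_rgd, rgrp_contains_block() and gfs2_blk2rgrpd() are the real identifiers from the diff, where this logic is open-coded in get_local_rgrp()):

	static struct gfs2_rgrpd *rgrp_for_goal(struct gfs2_inode *ip)
	{
		struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);

		/* Fast path: the rgrp used by this inode's last allocation */
		if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
			return ip->i_rgd;

		/* Slow path: look the goal block up in the rindex rbtree */
		return gfs2_blk2rgrpd(sdp, ip->i_goal);
	}

On a successful reservation, get_local_rgrp() stores the chosen rgrp back into ip->i_rgd, so the next allocation from the same inode usually skips the tree walk entirely.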
Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 6 +++--- fs/gfs2/incore.h | 3 +-- fs/gfs2/inode.c | 12 +++++------- fs/gfs2/quota.c | 4 ++-- fs/gfs2/rgrp.c | 51 +++++++++++++++++++++++++-------------------------- fs/gfs2/super.c | 1 + fs/gfs2/trans.h | 8 +++++--- fs/gfs2/xattr.c | 2 +- 9 files changed, 44 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 212fe74927ba..4858e1fed8b1 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; if (alloc_required) - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d717b72500a4..3467f3662149 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -397,7 +397,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) @@ -823,7 +823,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, unsigned int *data_blocks, unsigned int *ind_blocks) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { @@ -912,7 +912,7 @@ retry: al->al_requested = data_blocks + ind_blocks; rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); + RES_RG_HDR + gfs2_rg_blocks(ip); if (gfs2_is_jdata(ip)) rblocks += data_blocks ? 
data_blocks : 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 56847f5903ae..55e335b52839 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -256,8 +256,6 @@ struct gfs2_alloc { unsigned int al_line; char *al_file; struct gfs2_holder al_rgd_gh; - struct gfs2_rgrpd *al_rgd; - }; enum { @@ -279,6 +277,7 @@ struct gfs2_inode { struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_alloc *i_alloc; + struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 29703dd97dc9..55b3bbaf2f25 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + dip->i_rgd->rd_length + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -613,8 +613,7 @@ fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); @@ -731,8 +730,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, brelse(bh); gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); mark_inode_dirty(inode); @@ -896,7 +894,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(dip) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -1371,7 +1369,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(ndip) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a9a9749f496..10a59cd21f0c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -813,7 +813,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_alloc; if (nalloc) - blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -1598,7 +1598,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, error = gfs2_inplace_reserve(ip); if (error) goto out_alloc; - blocks += gfs2_rg_blocks(al); + blocks += gfs2_rg_blocks(ip); } /* Some quotas span block boundaries and can update two blocks, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 88d5b75067a8..5bfb97002c2a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -882,24 +882,21 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) /** * try_rgrp_fit - See if a given reservation will fit in a given RG * @rgd: the RG data - * @al: the struct gfs2_alloc structure describing the reservation + * @ip: the inode * * If there's room for the requested blocks to be allocated from the RG: - * Sets the $al_rgd field in @al. 
* * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) */ -static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) { + const struct gfs2_alloc *al = ip->i_alloc; + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - - if (rgd->rd_free_clone >= al->al_requested) { - al->al_rgd = rgd; + if (rgd->rd_free_clone >= al->al_requested) return 1; - } - return 0; } @@ -985,7 +982,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int error, rg_locked; int loops = 0; - rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) + rgd = begin = ip->i_rgd; + else + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); if (rgd == NULL) return -EBADSLT; @@ -1002,8 +1002,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) } switch (error) { case 0: - if (try_rgrp_fit(rgd, al)) + if (try_rgrp_fit(rgd, ip)) { + ip->i_rgd = rgd; return 0; + } if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) @@ -1014,7 +1016,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (rgd == begin) loops++; break; - default: return error; } @@ -1042,21 +1043,20 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; -try_again: do { error = get_local_rgrp(ip, &last_unlinked); - /* If there is no space, flushing the log may release some */ - if (error) { - if (ip == GFS2_I(sdp->sd_rindex) && - !sdp->sd_rindex_uptodate) { - error = gfs2_ri_update(ip); - if (error) - return error; - goto try_again; - } - gfs2_log_flush(sdp, NULL); + if (error != -ENOSPC) + break; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + break; + continue; } - } while (error && tries++ < 3); + /* Flushing the log may release space */ + gfs2_log_flush(sdp, NULL); + } while (tries++ < 3); if (error) return error; @@ -1086,7 +1086,6 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_alloced, al->al_requested, al->al_file, al->al_line); - al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); } @@ -1339,7 +1338,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) if (al == NULL) return -ECANCELED; - rgd = al->al_rgd; + rgd = ip->i_rgd; if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; @@ -1398,7 +1397,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_alloc *al = dip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; + struct gfs2_rgrpd *rgd = dip->i_rgd; u32 blk; u64 block; unsigned int n = 1; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index f716c4f8b252..87e9141a4def 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1574,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (ip) { ip->i_flags = 0; ip->i_gl = NULL; + ip->i_rgd = NULL; } return &ip->i_inode; } diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 980c5c05398a..f8f101ef600c 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -28,10 +28,12 @@ struct gfs2_glock; /* reserve either the number of blocks to be allocated plus the rg header * block, or all of the blocks in the rg, whichever is smaller */ -static inline unsigned int 
gfs2_rg_blocks(const struct gfs2_alloc *al) +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) { - return (al->al_requested < al->al_rgd->rd_length)? - al->al_requested + 1 : al->al_rgd->rd_length; + const struct gfs2_alloc *al = ip->i_alloc; + if (al->al_requested < ip->i_rgd->rd_length) + return al->al_requested + 1; + return ip->i_rgd->rd_length; } extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 167f4af53b1d..e7bf0ea1c3cc 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -727,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + gfs2_rg_blocks(al) + + blks + gfs2_rg_blocks(ip) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; -- cgit v1.2.3-58-ga151 From 534029e2fd06ac9a5a1b33b2735e3ac3242adb29 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 1 Sep 2011 16:36:44 +0100 Subject: GFS2: Remove obsolete assert Given that a resource group has been locked, there is no reason why we should not be able to allocate as many blocks as are free. The al_requested parameter should really be considered as a minimum number of blocks to be available. Should this limit be overshot, there are other mechanisms which will prevent over allocation. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5bfb97002c2a..08b3a8002aca 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1077,15 +1077,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; - if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) - fs_warn(sdp, "al_alloced = %u, al_requested = %u " - "al_file = %s, al_line = %u\n", - al->al_alloced, al->al_requested, al->al_file, - al->al_line); - if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); } -- cgit v1.2.3-58-ga151 From d56fa8a1c17b68274349fc852f634af99c0c4671 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Sep 2011 12:43:41 +0100 Subject: GFS2: Call do_strip() directly from recursive_scan() The recursive_scan() function only ever takes a single "bc" argument, so we might as well just call do_strip() directly from recursive_scan() rather than pass it in as an argument. Also the "data" argument is always a struct strip_mine, so we can pass that in, rather than using a void pointer. This also moves do_strip() ahead of recursive_scan() so that we don't need to add a prototype. 
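Reduced to its essentials, the change swaps an indirect call for a direct one (both lines excerpted from the hunks below):

	/* before: generic callback plus opaque data */
	error = bc(ip, dibh, bh, top, bottom, height, data);

	/* after: the one real callee, with a typed argument */
	error = do_strip(ip, dibh, bh, top, bottom, height, sm);

Since do_strip() was the only callback ever passed in, dropping the block_call_t indirection costs nothing, lets the compiler see the call target, and replaces a void * with a type-checked struct strip_mine *.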
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 149 +++++++++++++++++++++++++++------------------------------ 1 file changed, 71 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e60296137707..9d3a0c26df28 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -36,11 +36,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, - __be64 *bottom, unsigned int height, - void *data); - struct strip_mine { int sm_first; unsigned int sm_height; @@ -667,76 +662,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi return ret; } -/** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @bc: the call to make for each piece of metadata - * @data: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, block_call_t bc, - void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = bc(ip, dibh, bh, top, bottom, height, data); - if (error) - goto out; - - if (height < ip->i_height - 1) - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, bc, data); - if (error) - break; - } - -out: - brelse(bh); - return error; -} - /** * do_strip - Look for a layer a particular layer of the file and strip it off * @ip: the inode @@ -752,9 +677,8 @@ out: static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, void *data) + unsigned int height, struct strip_mine *sm) { - struct strip_mine *sm = data; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; @@ -885,6 +809,75 @@ out: return error; } +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @sm: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. 
+ * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, struct strip_mine *sm) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = do_strip(ip, dibh, bh, top, bottom, height, sm); + if (error) + goto out; + + if (height < ip->i_height - 1) + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, sm); + if (error) + break; + } + +out: + brelse(bh); + return error; +} + + /** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * @@ -1024,7 +1017,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) sm.sm_first = !!size; sm.sm_height = height; - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); if (error) break; } -- cgit v1.2.3-58-ga151 From 70b0c3656f12964a6dac104214c904c66e626058 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 2 Sep 2011 16:08:09 +0100 Subject: GFS2: Use cached rgrp in gfs2_rlist_add() Each block which is deallocated requires a call to gfs2_rlist_add(), and each of those calls was calling gfs2_blk2rgrpd() in order to figure out which rgrp the block belonged in. This can be speeded up by making use of the rgrp cached in the inode. We also reset this cached rgrp in case the block has changed rgrp. This should provide a big reduction in gfs2_blk2rgrpd() calls during deallocation. 
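The new lookup order in gfs2_rlist_add(), condensed from the rgrp.c hunk below:

	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
		rgd = ip->i_rgd;	/* common case: same rgrp as last time */
	else
		rgd = gfs2_blk2rgrpd(sdp, block);	/* fall back to the rbtree */
	if (!rgd) {
		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
		return;
	}
	ip->i_rgd = rgd;	/* cache for the next call */

Blocks freed from a single file tend to sit in the same resource group, so consecutive calls usually hit the cached rgrp and never touch the tree.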
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 4 ++-- fs/gfs2/dir.c | 2 +- fs/gfs2/rgrp.c | 14 +++++++++----- fs/gfs2/rgrp.h | 2 +- fs/gfs2/xattr.c | 4 ++-- 5 files changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9d3a0c26df28..22ad413213ca 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -724,7 +724,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; @@ -732,7 +732,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; /* Nothing to do */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 90b877b464ca..8ccad2467cb6 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1821,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (blk != leaf_no) brelse(bh); - gfs2_rlist_add(sdp, &rlist, blk); + gfs2_rlist_add(dip, &rlist, blk); l_blocks++; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 08b3a8002aca..3088fb25656d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1560,7 +1560,7 @@ fail: /** * gfs2_rlist_add - add a RG to a list of RGs - * @sdp: the filesystem + * @ip: the inode * @rlist: the list of resource groups * @block: the block * @@ -1570,9 +1570,10 @@ fail: * */ -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_rgrpd **tmp; unsigned int new_space; @@ -1581,12 +1582,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) return; - rgd = gfs2_blk2rgrpd(sdp, block); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) + rgd = ip->i_rgd; + else + rgd = gfs2_blk2rgrpd(sdp, block); if (!rgd) { - if (gfs2_consist(sdp)) - fs_err(sdp, "block = %llu\n", (unsigned long long)block); + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); return; } + ip->i_rgd = rgd; for (x = 0; x < rlist->rl_rgrps; x++) if (rlist->rl_rgd[x] == rgd) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0439fca18f08..0e886d830784 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -60,7 +60,7 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index e7bf0ea1c3cc..71d7bf830c09 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1356,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; } blks++; } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; -- cgit v1.2.3-58-ga151 From b5b24d7aeb9608935786369ac2d3e9f362877d55 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Sep 2011 10:33:25 +0100 Subject: GFS2: Fix AIL flush issue during fsync Unfortunately, it is not enough to just ignore locked buffers 
during the AIL flush from fsync. We need to be able to ignore all buffers which are locked, dirty or pinned at this stage as they might have been added subsequent to the log flush earlier in the fsync function. In addition, this means that we no longer need to rely on i_mutex to keep out writes during fsync, so we can, as a side-effect, remove that protection too. Signed-off-by: Steven Whitehouse Tested-By: Abhijith Das --- fs/gfs2/file.c | 8 ++------ fs/gfs2/glops.c | 32 ++++++++++++++++---------------- fs/gfs2/glops.h | 2 +- fs/gfs2/super.c | 2 +- 4 files changed, 20 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3467f3662149..3b65f67bb38e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -593,16 +593,12 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, sync_state &= ~I_DIRTY_SYNC; if (sync_state) { - mutex_lock(&inode->i_mutex); ret = sync_inode_metadata(inode, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); + if (ret) return ret; - } if (gfs2_is_jdata(ip)) filemap_write_and_wait(mapping); - gfs2_ail_flush(ip->i_gl); - mutex_unlock(&inode->i_mutex); + gfs2_ail_flush(ip->i_gl, 1); } if (mapping->nrpages) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 951541b6234c..78418b4fa857 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -42,41 +42,41 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) * * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl, unsigned long b_state) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; - struct gfs2_bufdata *bd; + struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); sector_t blocknr; + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, - bd_ail_gl_list); + list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { bh = bd->bd_bh; - blocknr = bh->b_blocknr; - if (bh->b_state & b_state) + if (bh->b_state & b_state) { + if (fsync) + continue; gfs2_ail_error(gl, bh); + } + blocknr = bh->b_blocknr; bh->b_private = NULL; gfs2_remove_from_ail(bd); /* drops ref on bh */ - spin_unlock(&sdp->sd_ail_lock); bd->bd_bh = NULL; bd->bd_blkno = blocknr; - gfs2_log_lock(sdp); gfs2_trans_add_revoke(sdp, bd); - gfs2_log_unlock(sdp); - - spin_lock(&sdp->sd_ail_lock); } - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); } @@ -99,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << BH_Pinned)|(1ul << BH_Lock)); + __gfs2_ail_flush(gl, 0); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -void gfs2_ail_flush(struct gfs2_glock *gl) +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); @@ -117,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl, (1ul << BH_Dirty)|(1ul << 
BH_Pinned)); + __gfs2_ail_flush(gl, fsync); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 6fce409b5a50..bf95a2dc1662 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; -extern void gfs2_ail_flush(struct gfs2_glock *gl); +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 87e9141a4def..71e420989f77 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1533,7 +1533,7 @@ static void gfs2_evict_inode(struct inode *inode) out_truncate: gfs2_log_flush(sdp, ip->i_gl); write_inode_now(inode, 1); - gfs2_ail_flush(ip->i_gl); + gfs2_ail_flush(ip->i_gl, 0); /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); -- cgit v1.2.3-58-ga151 From ccad4e147acf2a59b463f5df3cee8b43b144ce82 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Sep 2011 12:15:23 +0100 Subject: GFS2: Correctly set goal block after allocation The new goal block should be set to the end of the newly allocated extent, not the start of it. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3088fb25656d..8ec41744594b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1346,7 +1346,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; - ip->i_goal = block; + ip->i_goal = block + *n - 1; error = gfs2_meta_inode_buffer(ip, &dibh); if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; -- cgit v1.2.3-58-ga151 From 13d921e37174e3d1042deeb303537c1d935da553 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Sep 2011 15:12:51 +0100 Subject: GFS2: Clean up ->page_mkwrite This patch brings gfs2's ->page_mkwrite up to date with respect to the expectations set by the VM. Also added is a check to wait, if the fs is frozen, before we attempt to get a glock. This will only work on the node which initiates the freeze, but that's OK since the transaction lock will still provide the expected barrier on other nodes. The major change here is that we return a locked page now, except when we don't return a page at all (error cases). This removes the race which required rechecking the page after it was returned. Signed-off-by: Steven Whitehouse Cc: Nick Piggin --- fs/gfs2/file.c | 64 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3b65f67bb38e..aa3a4ddb834e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -366,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; struct gfs2_alloc *al; + loff_t size; int ret; + /* Wait if fs is frozen. 
This is racy so we check again later on + * and retry if the fs has been frozen after the page lock has + * been acquired + */ + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) @@ -376,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(page); + } goto out_unlock; + } + ret = -ENOMEM; al = gfs2_alloc_get(ip); if (al == NULL) @@ -405,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; + size = i_size_read(inode); + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + /* Check page index against inode size */ + if (size == 0 || (page->index > last_index)) + goto out_trans_end; + + ret = -EAGAIN; + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) + goto out_trans_end; + + /* Unstuff, if required, and allocate backing blocks for page */ ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { + if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); + if (ret == 0) + ret = gfs2_allocate_page_backing(page); -out_unlock_page: - unlock_page(page); +out_trans_end: + if (ret) + unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); @@ -431,11 +453,17 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else if (ret) - ret = VM_FAULT_SIGBUS; - return ret; + if (ret == 0) { + set_page_dirty(page); + /* This check must be post dropping of transaction lock */ + if (inode->i_sb->s_frozen == SB_UNFROZEN) { + wait_on_page_writeback(page); + } else { + ret = -EAGAIN; + unlock_page(page); + } + } + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct gfs2_vm_ops = { -- cgit v1.2.3-58-ga151 From f75bbfb4dda68c86eb33cde7e2b5c1343c6d5812 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 8 Sep 2011 10:21:13 +0100 Subject: GFS2: Fix off-by-one in gfs2_blk2rgrpd Bob reported: I found an off-by-one problem with how I coded this section: It should be: + else if (blk >= cur->rd_data0 + cur->rd_data) In fact, cur->rd_data0 + cur->rd_data is the start of the next rgrp (the next ri_addr), so without the "=" check it can land on the wrong rgrp. In all normal cases, this won't be a problem: you're searching for a block _within_ the rgrp, which will pass the test properly. Where it gets into trouble is if you search the rgrps for the block exactly equal to ri_addr. I don't think anything in the kernel does this, but I found a place in gfs2-utils gfs2_edit where it does. So I definitely need to fix it in libgfs2. I'd like to suggest we fix it in the kernel as well for the sake of keeping the functions similar. So this patch fixes the above-mentioned off-by-one error, as well as removing the unused parent pointer. 
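The fixed comparison encodes the fact that an rgrp owns the half-open block range [rd_data0, rd_data0 + rd_data), so its upper bound is exclusive (descent logic as in the hunk below, locking elided):

	if (blk < cur->rd_addr)
		newn = &((*newn)->rb_left);
	else if (blk >= cur->rd_data0 + cur->rd_data)	/* >=: first block of the next rgrp */
		newn = &((*newn)->rb_right);
	else
		return cur;	/* blk falls inside this rgrp */

With the old strict > test, a lookup for blk == rd_data0 + rd_data matched cur instead of descending right, returning the wrong rgrp.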
Reported-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8ec41744594b..1daf8a78c733 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -329,17 +329,16 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) { - struct rb_node **newn, *parent = NULL; + struct rb_node **newn; + struct gfs2_rgrpd *cur; spin_lock(&sdp->sd_rindex_spin); newn = &sdp->sd_rindex_tree.rb_node; while (*newn) { - struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, - rd_node); - parent = *newn; + cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); if (blk < cur->rd_addr) newn = &((*newn)->rb_left); - else if (blk > cur->rd_data0 + cur->rd_data) + else if (blk >= cur->rd_data0 + cur->rd_data) newn = &((*newn)->rb_right); else { spin_unlock(&sdp->sd_rindex_spin); -- cgit v1.2.3-58-ga151 From bd5437a7d4307a35f2c7cc19cad706ec0e5d61f0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 15 Sep 2011 09:59:56 -0400 Subject: GFS2: speed up delete/unlink performance for large files This patch improves the performance of delete/unlink operations in a GFS2 file system where the files are large by adding a layer of metadata read-ahead for indirect blocks. Mileage will vary, but on my system, deleting an 8.6G file dropped from 22 seconds to about 4.5 seconds. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 22ad413213ca..834cd9442a1d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -831,7 +831,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh = NULL; - __be64 *top, *bottom; + __be64 *top, *bottom, *t2; u64 bn; int error; int mh_size = sizeof(struct gfs2_meta_header); @@ -859,7 +859,27 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out; - if (height < ip->i_height - 1) + if (height < ip->i_height - 1) { + struct buffer_head *rabh; + + for (t2 = top; t2 < bottom; t2++, first = 0) { + if (!*t2) + continue; + + bn = be64_to_cpu(*t2); + rabh = gfs2_getbuf(ip->i_gl, bn, CREATE); + if (trylock_buffer(rabh)) { + if (buffer_uptodate(rabh)) { + unlock_buffer(rabh); + brelse(rabh); + continue; + } + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + brelse(rabh); + } for (; top < bottom; top++, first = 0) { if (!*top) continue; @@ -871,7 +891,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) break; } - + } out: brelse(bh); return error; -- cgit v1.2.3-58-ga151 From 64dd153c83743af81f20924c6343652d731eeecb Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Mon, 12 Sep 2011 18:15:24 -0500 Subject: GFS2: rewrite fallocate code to write blocks directly GFS2's fallocate code currently goes through the page cache. Since it's only writing to the end of the file or to holes in it, it doesn't need to, and it was causing issues in low-memory environments. This patch pulls in some of Steve's block allocation work, and uses it to simply allocate the blocks for the file, and zero them out at allocation time. It provides a slight performance increase, and it dramatically simplifies the code. 
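The heart of the new fallocate_chunk() loop, in outline (error handling trimmed; see the file.c hunk below):

	while (len) {
		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };

		bh_map.b_size = len;
		set_buffer_zeronew(&bh_map);	/* request zeroing at allocation time */
		error = gfs2_block_map(inode, lblock, &bh_map, 1);
		if (unlikely(error))
			goto out;
		len -= bh_map.b_size;		/* block_map may map less than asked for */
		lblock += bh_map.b_size >> inode->i_blkbits;
	}

The new BH_Zeronew buffer state tells gfs2_bmap_alloc() to zero each freshly allocated extent with sb_issue_zeroout(), so the data never passes through the page cache at all.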
Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 12 ++++ fs/gfs2/file.c | 171 ++++++++----------------------------------------------- fs/gfs2/incore.h | 3 + 3 files changed, 39 insertions(+), 147 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 834cd9442a1d..97b61955850a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -427,12 +428,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; + int ret; int eob = 0; enum alloc_state state; __be64 *ptr; @@ -535,6 +538,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, dblock = bn; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); + if (buffer_zeronew(bh_map)) { + ret = sb_issue_zeroout(sb, dblock, dblks, + GFP_NOFS); + if (ret) { + fs_err(sdp, + "Failed to zero data buffers\n"); + clear_buffer_zeronew(bh_map); + } + } break; } } while ((state != ALLOC_DATA) || !dblock); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index aa3a4ddb834e..5002408dabea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -669,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } -static int empty_write_end(struct page *page, unsigned from, - unsigned to, int mode) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - unsigned offset, blksize = 1 << inode->i_blkbits; - pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; - - zero_user(page, from, to-from); - mark_page_accessed(page); - - if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); - return 0; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - clear_buffer_new(bh); - write_dirty_buffer(bh, WRITE); - } - offset += blksize; - bh = bh->b_this_page; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - return -EIO; - } - offset += blksize; - bh = bh->b_this_page; - } - return 0; -} - -static int needs_empty_write(sector_t block, struct inode *inode) -{ - int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - - bh_map.b_size = 1 << inode->i_blkbits; - error = gfs2_block_map(inode, block, &bh_map, 0); - if (unlikely(error)) - return error; - return !buffer_mapped(&bh_map); -} - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to, - int mode) -{ - struct inode *inode = page->mapping->host; - unsigned start, end, next, blksize; - sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - int ret; - - blksize = 1 << inode->i_blkbits; - next = end = 0; - while (next < from) { - next += blksize; - block++; - } - start = next; - do { - next += blksize; - ret = needs_empty_write(block, inode); - if (unlikely(ret < 0)) - return ret; - if (ret == 0) { - if (end) { - ret = 
__block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - end = 0; - } - start = next; - } - else - end = next; - block++; - } while (next < to); - - if (end) { - ret = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - } - - return 0; -} - static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, int mode) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; + unsigned int nr_blks; + sector_t lblock = offset >> inode->i_blkbits; error = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(error)) - goto out; + return error; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -807,39 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } + while (len) { + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + bh_map.b_size = len; + set_buffer_zeronew(&bh_map); - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to, mode); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) + error = gfs2_block_map(inode, lblock, &bh_map, 1); + if (unlikely(error)) goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; + len -= bh_map.b_size; + nr_blks = bh_map.b_size >> inode->i_blkbits; + lblock += nr_blks; + if (!buffer_new(&bh_map)) + continue; + if (unlikely(!buffer_zeronew(&bh_map))) { + error = -EIO; + goto out; + } } + if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) + i_size_write(inode, offset + len); mark_inode_dirty(inode); - brelse(dibh); - out: + brelse(dibh); return error; } @@ -879,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, int error; loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; /* We only support the FALLOC_FL_KEEP_SIZE mode */ @@ -932,7 +808,8 @@ retry: goto out_qunlock; } max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + calc_max_reserv(ip, (len > max_chunk_size)? 
max_chunk_size: len, + &max_bytes, &data_blocks, &ind_blocks); al->al_requested = data_blocks + ind_blocks; rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 55e335b52839..6429aa4339ff 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -103,12 +103,15 @@ struct gfs2_rgrpd { enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, + BH_Zeronew = BH_PrivateStart + 2, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) +BUFFER_FNS(Zeronew, zeronew) +TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; -- cgit v1.2.3-58-ga151 From 891a8e9335176b7eb9adc5e34f555ee5e1da47c6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 19 Sep 2011 10:25:49 +0100 Subject: GFS2: Misc fixes Some items picked up through automated code analysis. A few bits of unreachable code and two unchecked return values. Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 2 -- fs/gfs2/ops_fstype.c | 3 --- fs/gfs2/quota.c | 11 ++++++++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index de05b4d66ef3..0301be655b12 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -695,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - if (error) - break; sdp->sd_replayed_blocks++; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 69166ff7f37d..7e823bbd2453 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -651,7 +651,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); } - ip = GFS2_I(sdp->sd_jindex); /* Load in the journal index special file */ @@ -763,7 +762,6 @@ fail: static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct gfs2_inode *ip; struct inode *master = sdp->sd_master_dir->d_inode; if (undo) @@ -788,7 +786,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get resource index inode: %d\n", error); goto fail_statfs; } - ip = GFS2_I(sdp->sd_rindex); sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 10a59cd21f0c..7e528dc14f85 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -645,8 +645,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, int err, nbytes; u64 size; - if (gfs2_is_stuffed(ip)) - gfs2_unstuff_dinode(ip, NULL); + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip, NULL); + if (err) + return err; + } memset(&q, 0, sizeof(struct gfs2_quota)); err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); @@ -927,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) unsigned int x; int error = 0; - gfs2_quota_hold(ip, uid, gid); + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) -- cgit v1.2.3-58-ga151 From 9ae32429fe036fcfce036ec57b28fc59f3911976 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Sep 2011 12:16:11 +0100 Subject: GFS2: Remove two unused variables The two variables being initialised in gfs2_inplace_reserve to track the file & line number of the caller are never used, so we might as well remove them. If something does go wrong, then a stack trace is probably more useful anyway. 
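For reference, the pattern being deleted is the usual caller-site capture behind a wrapper macro (see the rgrp.h hunk below):

	/* before: every caller's __FILE__/__LINE__ stored in the gfs2_alloc */
	#define gfs2_inplace_reserve(ip) \
		gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)

	/* after: a plain function; debugging relies on stack traces instead */
	extern int gfs2_inplace_reserve(struct gfs2_inode *ip);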
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 4 ---- fs/gfs2/rgrp.c | 14 +++----------- fs/gfs2/rgrp.h | 6 +----- 3 files changed, 4 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6429aa4339ff..7389dfdcc9ef 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -246,7 +246,6 @@ struct gfs2_glock { struct gfs2_alloc { /* Quota stuff */ - struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; unsigned int al_qd_num; @@ -255,9 +254,6 @@ struct gfs2_alloc { u32 al_alloced; /* Filled in by gfs2_alloc_*() */ /* Filled in by gfs2_inplace_reserve() */ - - unsigned int al_line; - char *al_file; struct gfs2_holder al_rgd_gh; }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1daf8a78c733..96bd6d759f29 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1024,14 +1024,13 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) } /** - * gfs2_inplace_reserve_i - Reserve space in the filesystem + * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for * * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line) +int gfs2_inplace_reserve(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; @@ -1057,14 +1056,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, gfs2_log_flush(sdp, NULL); } while (tries++ < 3); - if (error) - return error; - - /* no error, so we have the rgrp set in the inode's allocation. */ - al->al_file = file; - al->al_line = line; - - return 0; + return error; } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0e886d830784..cf5c50180192 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -36,11 +36,7 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line); -#define gfs2_inplace_reserve(ip) \ - gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) - +extern int gfs2_inplace_reserve(struct gfs2_inode *ip); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -- cgit v1.2.3-58-ga151 From b99b98dc2673a123a73068f16720232d7be7e669 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 21 Sep 2011 11:05:16 +0100 Subject: GFS2: Move readahead of metadata during deallocation into its own function Move the recently added readahead of the indirect pointer tree during deallocation into its own function in order that we can use it elsewhere in the future. Also this fixes the resetting of the "first" variable in the original patch. 
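The helper keeps the non-blocking behaviour of the original code: trylock_buffer() means read-ahead never stalls the deallocation walk (loop body as in the gfs2_metapath_ra() hunk below):

	rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
	if (trylock_buffer(rabh)) {
		if (!buffer_uptodate(rabh)) {
			rabh->b_end_io = end_buffer_read_sync;
			submit_bh(READA | REQ_META, rabh);	/* async; completion drops the ref */
			continue;
		}
		unlock_buffer(rabh);	/* already cached, nothing to do */
	}
	brelse(rabh);

If another reader holds the buffer lock, the block is simply skipped, and READA keeps the request best-effort at the block layer as well.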
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 97b61955850a..41d494d79709 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -269,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; } +static void gfs2_metapath_ra(struct gfs2_glock *gl, + const struct buffer_head *bh, const __be64 *pos) +{ + struct buffer_head *rabh; + const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); + const __be64 *t; + + for (t = pos; t < endp; t++) { + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } +} + /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode @@ -843,7 +867,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh = NULL; - __be64 *top, *bottom, *t2; + __be64 *top, *bottom; u64 bn; int error; int mh_size = sizeof(struct gfs2_meta_header); @@ -872,26 +896,9 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, goto out; if (height < ip->i_height - 1) { - struct buffer_head *rabh; - for (t2 = top; t2 < bottom; t2++, first = 0) { - if (!*t2) - continue; + gfs2_metapath_ra(ip->i_gl, bh, top); - bn = be64_to_cpu(*t2); - rabh = gfs2_getbuf(ip->i_gl, bn, CREATE); - if (trylock_buffer(rabh)) { - if (buffer_uptodate(rabh)) { - unlock_buffer(rabh); - brelse(rabh); - continue; - } - rabh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META, rabh); - continue; - } - brelse(rabh); - } for (; top < bottom; top++, first = 0) { if (!*top) continue; -- cgit v1.2.3-58-ga151 From a2d6b6cacb4fd4494b4037c01abb1332cefbb37b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 21 Oct 2011 10:14:04 +0400 Subject: CIFS: Fix error handling in cifs_readv_complete In cifs_readv_receive we don't update rdata->result to an error value after kmap'ing a page. We should kunmap the page only in the no-error case. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index aaad4ce6e6c5..4435b11c41b9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1619,9 +1619,9 @@ cifs_readv_complete(struct work_struct *work) list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { list_del(&page->lru); lru_cache_add_file(page); - kunmap(page); if (rdata->result == 0) { + kunmap(page); flush_dcache_page(page); SetPageUptodate(page); } -- cgit v1.2.3-58-ga151 From 1939dd84b3f52e9c8d1b46dffae2058f16a3ff6a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 22 Oct 2011 01:26:05 -0400 Subject: ext4: cleanup ext4_ext_grow_indepth code Currently the code gives the impression that the grow procedure is very complicated, with mythical paths and blocks involved. But in fact growing in depth is a relatively simple procedure: 1) Just create a new meta block and copy the root data to that block. 
2) Convert the root from extent to index format if the old depth == 0 3) Update the root block pointer This patch does the following: - Reorganize the code to make it more self-explanatory - Do not pass the path parameter to new_meta_block(), in order to provoke allocation from the inode's group, because the top-level block should sit closer to its inode, not to the leaf data block. [ This happens anyway, due to logic in mballoc; we should drop the path parameter from new_meta_block() entirely. -- tytso ] Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2fc6cc0a51ca..385ebb435332 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1050,16 +1050,14 @@ cleanup: */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags, - struct ext4_ext_path *path, struct ext4_extent *newext) { - struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, + newblock = ext4_ext_new_meta_block(handle, inode, NULL, newext, &err, flags); if (newblock == 0) return err; @@ -1079,7 +1077,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, } /* move top-level index/leaf into new block */ - memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); /* set size of new block */ neh = ext_block_hdr(bh); @@ -1097,32 +1096,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (err) goto out; - /* create index in new top-level index: num,max,pointer */ - err = ext4_ext_get_access(handle, inode, curp); - if (err) - goto out; - - curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); - curp->p_hdr->eh_entries = cpu_to_le16(1); - curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - - if (path[0].p_hdr->eh_depth) - curp->p_idx->ei_block = - EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; - else - curp->p_idx->ei_block = - EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; - ext4_idx_store_pblock(curp->p_idx, newblock); - + /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(path->p_depth + 1); - err = ext4_ext_dirty(handle, inode, curp); + neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); + ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1170,8 +1160,7 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, - path, newext); + err = ext4_ext_grow_indepth(handle, inode, flags, newext); if (err) goto out; -- cgit v1.2.3-58-ga151 From 42274bb22afc3e877ae5abed787b0b09d7dede52 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 14:37:50 +0400 Subject: CIFS: Fix DFS handling in cifs_get_file_info
We should call cifs_all_info_to_fattr in rc == 0 case only. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 663c4e313be4..2c50bd2f65d1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp) xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); - if (rc == -EOPNOTSUPP || rc == -EINVAL) { + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: /* * FIXME: legacy server -- fall back to path-based call? * for now, just skip revalidating and mark inode for @@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp) */ rc = 0; CIFS_I(inode)->time = 0; + default: goto cgfi_exit; - } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); - rc = 0; - } else if (rc) - goto cgfi_exit; + } /* * don't bother with SFU junk here -- just mark inode as needing * revalidation. */ - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; cifs_fattr_to_inode(inode, &fattr); -- cgit v1.2.3-58-ga151 From 5423732a71577f7860c56a4eea2c34ff162ddd73 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:12:58 -0700 Subject: nfsd41: use SEQ4_STATUS_BACKCHANNEL_FAULT when cb_sequence is invalid Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 10 ++++++++++ fs/nfsd/nfs4state.c | 8 +++++++- fs/nfsd/state.h | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index de018ecadae6..7748d6a18d97 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: + if (status) + nfsd4_mark_cb_fault(cb->cb_clp, status); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) warn_no_callback_path(clp, reason); } +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_FAULT; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e8c2a3ec0e60..b51ad43b7ea1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1945,8 +1945,14 @@ out: nfsd4_get_session(cstate->session); atomic_inc(&clp->cl_refcount); - if (clp->cl_cb_state == NFSD4_CB_DOWN) + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags |= SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + } } kfree(conn); spin_unlock(&client_lock); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1a5820066040..a3cf38476a1b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -258,6 +258,7 @@ 
struct nfs4_client { #define NFSD4_CB_UP 0 #define NFSD4_CB_UNKNOWN 1 #define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; -- cgit v1.2.3-58-ga151 From fc0c3dd13bac0675cdedc4e0d0641aa8a22e82de Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:13:06 -0700 Subject: nfsd4: seq->status_flags may be used uninitialized Reported-by: Gopala Suryanarayana Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b51ad43b7ea1..1527aaffb000 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1947,11 +1947,13 @@ out: atomic_inc(&clp->cl_refcount); switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: - seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; break; case NFSD4_CB_FAULT: - seq->status_flags |= SEQ4_STATUS_BACKCHANNEL_FAULT; + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; break; + default: + seq->status_flags = 0; } } kfree(conn); -- cgit v1.2.3-58-ga151 From c668fc6dfcce98f8222ce1c997f4e5c4ac63f3d0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:13:13 -0700 Subject: nfsd4: allow NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED RFC5661 says: The client may set one or both of OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL and OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED. Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fdc09a52cd8d..706ada1956c1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -677,6 +677,8 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) switch (w) { case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): return nfs_ok; } xdr_error: -- cgit v1.2.3-58-ga151 From 92bac8c5d60623167c6802b1f125e6d623708185 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 19 Oct 2011 19:13:29 -0700 Subject: nfsd4: typo logical vs bitwise negate for want_mask Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 706ada1956c1..decea140b323 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -671,7 +671,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) default: return nfserr_bad_xdr; } - w &= !NFS4_SHARE_WANT_MASK; + w &= ~NFS4_SHARE_WANT_MASK; if (!w) return nfs_ok; switch (w) { -- cgit v1.2.3-58-ga151 From 345c284290cabb5484df909303e73d6def8ec8ec Mon Sep 17 00:00:00 2001 From: Mi Jinlong Date: Thu, 20 Oct 2011 17:51:39 +0800 Subject: nfs41: implement DESTROY_CLIENTID operation According to RFC 5661 section 18.50, implement the DESTROY_CLIENTID operation. Signed-off-by: Mi Jinlong Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4state.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 12 +++++++++++- fs/nfsd/xdr4.h | 5 +++++ 4 files changed, 61 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 458ebb6b59c7..fa383361bc61 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1645,7 +1645,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SEQUENCE", }, [OP_DESTROY_CLIENTID] = { - .op_func = NULL, + .op_func = (nfsd4op_func)nfsd4_destroy_clientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_CLIENTID", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1527aaffb000..47e94e33a975 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1962,6 +1962,50 @@ out: return status; } +static inline bool has_resources(struct nfs4_client *clp) +{ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) +{ + struct nfs4_client *conf, *unconf, *clp; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&dc->clientid); + conf = find_confirmed_client(&dc->clientid); + + if (conf) { + clp = conf; + + if (!is_client_expired(conf) && has_resources(conf)) { + status = nfserr_clientid_busy; + goto out; + } + + /* rfc5661 18.50.3 */ + if (cstate->session && conf == cstate->session->se_client) { + status = nfserr_clientid_busy; + goto out; + } + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + + expire_client(clp); +out: + nfs4_unlock_state(); + dprintk("%s return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index decea140b323..66d095d7955e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1417,6 +1417,16 @@ xdr_error: goto out; } +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) { DECODE_HEAD; @@ -1538,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index e3057350eea1..2364747ee97d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -398,6 +398,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + struct nfsd4_reclaim_complete { u32 rca_one_fs; }; @@ -552,6 +556,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct 
nfsd4_destroy_clientid *); __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); -- cgit v1.2.3-58-ga151 From f9d9ef62cd3ecbd6cbb7957a253c1e81f69d5586 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 29 Sep 2011 13:11:33 +0200 Subject: btrfs: do not allow mounting non-subvolumes via subvol option There's a missing test of whether the path passed to the subvol=path option during mount is a real subvolume, which allows any directory located in the default subvolume to be passed and accepted for mount. (Current btrfs progs prevent this early.) $ btrfs subvol snapshot . p1-snap ERROR: '.' is not a subvolume (with "is subvolume?" test bypassed) $ btrfs subvol snapshot . p1-snap Create a snapshot of '.' in './p1-snap' $ btrfs subvol list -p . ID 258 parent 5 top level 5 path subvol ID 259 parent 5 top level 5 path subvol1 ID 260 parent 5 top level 5 path default-subvol1 ID 262 parent 5 top level 5 path p1/p1-snapshot ID 263 parent 259 top level 5 path subvol1/subvol1-snap The problem is that this gives the false impression of snapshotting the given subvolume when in fact it snapshots the default one: a user expects an outcome like ID 263 but in fact gets ID 262. This patch makes the mount fail with EINVAL and logs a message to syslog. Signed-off-by: David Sterba --- fs/btrfs/super.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 29eecbb6ec3a..5429b1fa0bfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -740,6 +740,16 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + /* * This will strip out the subvol=%s argument for an argument string and add * subvolid=0 to make sure we get the actual tree root for path walking to the @@ -843,6 +853,15 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (error) return ERR_PTR(error); + if (!is_subvolume_inode(path.dentry->d_inode)) { + path_put(&path); + mntput(mnt); + error = -EINVAL; + printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + subvol_name); + return ERR_PTR(-EINVAL); + } + /* Get a ref to the sb and the dentry we found and return it */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); -- cgit v1.2.3-58-ga151 From dff51cd1c60856c28f5d22a571294c2b70b6b322 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 14 Jun 2011 12:52:17 +0200 Subject: btrfs: ratelimit WARN_ON in use_block_rsv Under some circumstances the WARN_ON heavily pollutes the log and slows down the machine. This is just a safety net, as the warning should be fixed by another patch; nevertheless, it still pops up during testing.
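The throttling idiom applied in the diff below is the generic one from linux/ratelimit.h; a minimal sketch of the pattern follows (the function name and message are illustrative, not btrfs code):

#include <linux/kernel.h>
#include <linux/ratelimit.h>

static void report_reservation_failure(int ret)
{
	/* allow at most 2 reports per DEFAULT_RATELIMIT_INTERVAL (5s) */
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 2);

	if (__ratelimit(&rs)) {
		printk(KERN_DEBUG "reservation failed: %d\n", ret);
		WARN_ON(1);
	}
}

__ratelimit() returns nonzero while the caller is still within its burst budget for the current interval, so the WARN_ON backtrace is emitted a bounded number of times instead of once per failure.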
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cef355f1328a..28c4809851a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "ctree.h" @@ -5783,7 +5784,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; -- cgit v1.2.3-58-ga151 From 9562ad9ab36df7ccef920d119f3b5100025db95f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 24 Oct 2011 16:11:30 +0200 Subject: block: Remove the control of complete cpu from bio. bio originally had the functionality to set the completion CPU, but it is broken. Christoph said that "This code is unused, and from the all the discussions lately pretty obviously broken. The only thing keeping it serves is creating more confusion and possibly more bugs." And Jens replied with "We can kill bio_set_completion_cpu(). I'm fine with leaving cpu control to the request based drivers, they are the only ones that can toggle the setting anyway". So this patch removes all the work of controlling the completion CPU from a bio. Cc: Shaohua Li Cc: Christoph Hellwig Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- drivers/md/raid1.c | 1 - fs/bio.c | 1 - include/linux/bio.h | 8 -------- include/linux/blk_types.h | 11 ++++------- 5 files changed, 5 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index 7e1523521c70..da697936d220 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1276,7 +1276,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { - req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; @@ -1362,8 +1361,7 @@ get_rq: */ init_request_from_bio(req, bio); - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); plug = current->plug; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d4ddfa627301..2948a520f7ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2172,7 +2172,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_next = NULL; bio->bi_flags &= ~(BIO_POOL_MASK-1); bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; diff --git a/fs/bio.c b/fs/bio.c index 9bfade8a609b..41c93c722244 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -255,7 +255,6 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); diff --git a/include/linux/bio.h b/include/linux/bio.h index ce33e6868a2f..a3c071c9e189 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -268,14 +268,6 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int
bvec_nr_vecs(unsigned short idx); -/* - * Allow queuer to specify a completion CPU for this bio - */ -static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) -{ - bio->bi_comp_cpu = cpu; -} - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 71fc53bb8f1c..4053cbd4490e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -59,8 +59,6 @@ struct bio { unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ - unsigned int bi_comp_cpu; /* completion CPU */ - atomic_t bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -93,11 +91,10 @@ struct bio { #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ -#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ -#define BIO_NULL_MAPPED 9 /* contains invalid user pages */ -#define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ -#define BIO_QUIET 11 /* Make BIO Quiet */ -#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */ +#define BIO_NULL_MAPPED 8 /* contains invalid user pages */ +#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ +#define BIO_QUIET 10 /* Make BIO Quiet */ +#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* -- cgit v1.2.3-58-ga151 From abfa034e4b8ed0046fa589769e9840af645bc4ba Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 16 Aug 2011 10:50:10 +0530 Subject: fs/9p: Update zero-copy implementation in 9p * remove a lot of updates to different data structures * add a separate callback for zero-copy requests; the above makes the non-zero-copy code path simpler * remove conditionalizing TREAD/TREADDIR/TWRITE in the zero-copy path * Fix the dotu p9_check_errors with zero copy.
Add sufficient doc around * Add support for both in and output buffers in zero copy callback * pin and unpin pages in the same context * use helpers instead of defining page offset and rest of page ourself * Fix mem leak in p9_check_errors * Remove 'E' and 'F' in p9pdu_vwritef Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 2 +- include/net/9p/9p.h | 11 +- include/net/9p/transport.h | 10 +- net/9p/client.c | 391 +++++++++++++++++++++++++++++++++------------ net/9p/protocol.c | 46 +----- net/9p/protocol.h | 1 + net/9p/trans_common.c | 53 ++---- net/9p/trans_common.h | 21 +-- net/9p/trans_virtio.c | 319 +++++++++++++++++++++--------------- 9 files changed, 506 insertions(+), 348 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 9c2bdda5cd9d..ce6600f33659 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -231,7 +231,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (err == 0) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + filp->f_pos); if (err <= 0) goto unlock_and_exit; diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index a6326ef8ade6..d83a01300871 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -359,6 +359,9 @@ enum p9_qid_t { /* Room for readdir header */ #define P9_READDIRHDRSZ 24 +/* size of header for zero copy read/write */ +#define P9_ZC_HDR_SZ 4096 + /** * struct p9_qid - file system entity information * @type: 8-bit type &p9_qid_t @@ -555,10 +558,6 @@ struct p9_rstatfs { * @tag: transaction id of the request * @offset: used by marshalling routines to track current position in buffer * @capacity: used by marshalling routines to track total malloc'd capacity - * @pubuf: Payload user buffer given by the caller - * @pkbuf: Payload kernel buffer given by the caller - * @pbuf_size: pubuf/pkbuf(only one will be !NULL) size to be read/write. - * @private: For transport layer's use. * @sdata: payload * * &p9_fcall represents the structure for all 9P RPC @@ -575,10 +574,6 @@ struct p9_fcall { size_t offset; size_t capacity; - char __user *pubuf; - char *pkbuf; - size_t pbuf_size; - void *private; u8 *sdata; }; diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 83531ebeee99..adcbb20f6511 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -26,13 +26,6 @@ #ifndef NET_9P_TRANSPORT_H #define NET_9P_TRANSPORT_H -#define P9_TRANS_PREF_PAYLOAD_MASK 0x1 - -/* Default. 
Add Payload to PDU before sending it down to transport layer */ -#define P9_TRANS_PREF_PAYLOAD_DEF 0x0 -/* Send pay load separately to transport layer along with PDU.*/ -#define P9_TRANS_PREF_PAYLOAD_SEP 0x1 - /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports @@ -56,13 +49,14 @@ struct p9_trans_module { struct list_head list; char *name; /* name of transport */ int maxsize; /* max message size of transport */ - int pref; /* Preferences of this transport */ int def; /* this transport should be default */ struct module *owner; int (*create)(struct p9_client *, const char *, char *); void (*close) (struct p9_client *); int (*request) (struct p9_client *, struct p9_req_t *req); int (*cancel) (struct p9_client *, struct p9_req_t *req); + int (*zc_request)(struct p9_client *, struct p9_req_t *, + char *, char *, int , int, int, int); }; void v9fs_register_trans(struct p9_trans_module *m); diff --git a/net/9p/client.c b/net/9p/client.c index 0505a03c374c..305a4e719b03 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -203,11 +203,12 @@ free_and_return: * */ -static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) +static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag, int max_size) { unsigned long flags; int row, col; struct p9_req_t *req; + int alloc_msize = min(c->msize, max_size); /* This looks up the original request by tag so we know which * buffer to read the data into */ @@ -245,23 +246,12 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) return ERR_PTR(-ENOMEM); } init_waitqueue_head(req->wq); - if ((c->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) { - int alloc_msize = min(c->msize, 4096); - req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_NOFS); - req->tc->capacity = alloc_msize; - req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_NOFS); - req->rc->capacity = alloc_msize; - } else { - req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_NOFS); - req->tc->capacity = c->msize; - req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_NOFS); - req->rc->capacity = c->msize; - } + req->tc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, + GFP_NOFS); + req->tc->capacity = alloc_msize; + req->rc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, + GFP_NOFS); + req->rc->capacity = alloc_msize; if ((!req->tc) || (!req->rc)) { printk(KERN_ERR "Couldn't grow tag array\n"); kfree(req->tc); @@ -485,27 +475,8 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!p9_is_proto_dotl(c)) { char *ename; - - if (req->tc->pbuf_size) { - /* Handle user buffers */ - size_t len = req->rc->size - req->rc->offset; - if (req->tc->pubuf) { - /* User Buffer */ - err = copy_from_user( - &req->rc->sdata[req->rc->offset], - req->tc->pubuf, len); - if (err) { - err = -EFAULT; - goto out_err; - } - } else { - /* Kernel Buffer */ - memmove(&req->rc->sdata[req->rc->offset], - req->tc->pkbuf, len); - } - } err = p9pdu_readf(req->rc, c->proto_version, "s?d", - &ename, &ecode); + &ename, &ecode); if (err) goto out_err; @@ -515,11 +486,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!err || !IS_ERR_VALUE(err)) { err = p9_errstr2errno(ename, strlen(ename)); - P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, - ename); - - kfree(ename); + P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); } + kfree(ename); } else { err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); 
err = -ecode; @@ -527,7 +497,6 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } - return err; out_err: @@ -536,6 +505,110 @@ out_err: return err; } +/** + * p9_check_zc_errors - check 9p packet for error return and process it + * @c: current client instance + * @req: request to parse and check for error conditions + * @in_hdrlen: Size of response protocol buffer. + * + * returns error code if one is discovered, otherwise returns 0 + * + * this will have to be more complicated if we have multiple + * error packet types + */ + +static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, + char *uidata, int in_hdrlen, int kern_buf) +{ + int err; + int ecode; + int8_t type; + char *ename = NULL; + + err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + if (err) { + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); + return err; + } + + if (type != P9_RERROR && type != P9_RLERROR) + return 0; + + if (!p9_is_proto_dotl(c)) { + /* Error is reported in string format */ + uint16_t len; + /* 7 = header size for RERROR, 2 is the size of string len; */ + int inline_len = in_hdrlen - (7 + 2); + + /* Read the size of error string */ + err = p9pdu_readf(req->rc, c->proto_version, "w", &len); + if (err) + goto out_err; + + ename = kmalloc(len + 1, GFP_NOFS); + if (!ename) { + err = -ENOMEM; + goto out_err; + } + if (len <= inline_len) { + /* We have error in protocol buffer itself */ + if (pdu_read(req->rc, ename, len)) { + err = -EFAULT; + goto out_free; + + } + } else { + /* + * Part of the data is in user space buffer. + */ + if (pdu_read(req->rc, ename, inline_len)) { + err = -EFAULT; + goto out_free; + + } + if (kern_buf) { + memcpy(ename + inline_len, uidata, + len - inline_len); + } else { + err = copy_from_user(ename + inline_len, + uidata, len - inline_len); + if (err) { + err = -EFAULT; + goto out_free; + } + } + } + ename[len] = 0; + if (p9_is_proto_dotu(c)) { + /* For dotu we also have error code */ + err = p9pdu_readf(req->rc, + c->proto_version, "d", &ecode); + if (err) + goto out_free; + err = -ecode; + } + if (!err || !IS_ERR_VALUE(err)) { + err = p9_errstr2errno(ename, strlen(ename)); + + P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); + } + kfree(ename); + } else { + err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = -ecode; + + P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + } + return err; + +out_free: + kfree(ename); +out_err: + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + return err; +} + static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); @@ -579,23 +652,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) return 0; } -/** - * p9_client_rpc - issue a request and wait for a response - * @c: client session - * @type: type of request - * @fmt: protocol format string (see protocol.c) - * - * Returns request structure (which client must free using p9_free_req) - */ - -static struct p9_req_t * -p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) +static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + int8_t type, int req_size, + const char *fmt, va_list ap) { - va_list ap; int tag, err; struct p9_req_t *req; - unsigned long flags; - int sigpending; P9_DPRINTK(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -607,12 +669,6 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
if ((c->status == BeginDisconnect) && (type != P9_TCLUNK)) return ERR_PTR(-EIO); - if (signal_pending(current)) { - sigpending = 1; - clear_thread_flag(TIF_SIGPENDING); - } else - sigpending = 0; - tag = P9_NOTAG; if (type != P9_TVERSION) { tag = p9_idpool_get(c->tagpool); @@ -620,18 +676,50 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) return ERR_PTR(-ENOMEM); } - req = p9_tag_alloc(c, tag); + req = p9_tag_alloc(c, tag, req_size); if (IS_ERR(req)) return req; /* marshall the data */ p9pdu_prepare(req->tc, tag, type); - va_start(ap, fmt); err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); - va_end(ap); if (err) goto reterr; p9pdu_finalize(req->tc); + return req; +reterr: + p9_free_req(c, req); + return ERR_PTR(err); +} + +/** + * p9_client_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_free_req) + */ + +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) +{ + va_list ap; + int sigpending, err; + unsigned long flags; + struct p9_req_t *req; + + va_start(ap, fmt); + req = p9_client_prepare_req(c, type, c->msize, fmt, ap); + va_end(ap); + if (IS_ERR(req)) + return req; + + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } else + sigpending = 0; err = c->trans_mod->request(c, req); if (err < 0) { @@ -639,18 +727,14 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) c->status = Disconnected; goto reterr; } - - P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d\n", req->wq, tag); + /* Wait for the response */ err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); - P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d returned %d\n", - req->wq, tag, err); + req->status >= REQ_STATUS_RCVD); if (req->status == REQ_STATUS_ERROR) { P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } - if ((err == -ERESTARTSYS) && (c->status == Connected)) { P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; @@ -663,13 +747,11 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) if (req->status == REQ_STATUS_RCVD) err = 0; } - if (sigpending) { spin_lock_irqsave(¤t->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - if (err < 0) goto reterr; @@ -678,7 +760,92 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); return req; } +reterr: + P9_DPRINTK(P9_DEBUG_MUX, + "exit: client %p op %d error: %d\n", c, type, err); + p9_free_req(c, req); + return ERR_PTR(err); +} + +/** + * p9_client_zc_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @uidata: user bffer that should be ued for zero copy read + * @uodata: user buffer that shoud be user for zero copy write + * @inlen: read buffer size + * @olen: write buffer size + * @hdrlen: reader header size, This is the size of response protocol data + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_free_req) + */ +static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, + char *uidata, char *uodata, + int inlen, int olen, int in_hdrlen, + int kern_buf, const char *fmt, ...) 
+{ + va_list ap; + int sigpending, err; + unsigned long flags; + struct p9_req_t *req; + + va_start(ap, fmt); + /* + * We allocate a inline protocol data of only 4k bytes. + * The actual content is passed in zero-copy fashion. + */ + req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap); + va_end(ap); + if (IS_ERR(req)) + return req; + + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } else + sigpending = 0; + + /* If we are called with KERNEL_DS force kern_buf */ + if (segment_eq(get_fs(), KERNEL_DS)) + kern_buf = 1; + + err = c->trans_mod->zc_request(c, req, uidata, uodata, + inlen, olen, in_hdrlen, kern_buf); + if (err < 0) { + if (err == -EIO) + c->status = Disconnected; + goto reterr; + } + if (req->status == REQ_STATUS_ERROR) { + P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + err = req->t_err; + } + if ((err == -ERESTARTSYS) && (c->status == Connected)) { + P9_DPRINTK(P9_DEBUG_MUX, "flushing\n"); + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + if (c->trans_mod->cancel(c, req)) + p9_client_flush(c, req); + + /* if we received the response anyway, don't signal error */ + if (req->status == REQ_STATUS_RCVD) + err = 0; + } + if (sigpending) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + if (err < 0) + goto reterr; + + err = p9_check_zc_errors(c, req, uidata, in_hdrlen, kern_buf); + if (!err) { + P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); + return req; + } reterr: P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d error: %d\n", c, type, err); @@ -1330,13 +1497,15 @@ int p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, u32 count) { - int err, rsize; - struct p9_client *clnt; - struct p9_req_t *req; char *dataptr; + int kernel_buf = 0; + struct p9_req_t *req; + struct p9_client *clnt; + int err, rsize, non_zc = 0; + - P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, - (long long unsigned) offset, count); + P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", + fid->fid, (long long unsigned) offset, count); err = 0; clnt = fid->clnt; @@ -1348,13 +1517,24 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ - if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { - req = p9_client_rpc(clnt, P9_TREAD, "dqE", fid->fid, offset, - rsize, data, udata); + if (clnt->trans_mod->zc_request && rsize > 1024) { + char *indata; + if (data) { + kernel_buf = 1; + indata = data; + } else + indata = (char *)udata; + /* + * response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREAD, indata, NULL, rsize, 0, + 11, kernel_buf, "dqd", fid->fid, + offset, rsize); } else { + non_zc = 1; req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, - rsize); + rsize); } if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1370,7 +1550,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count); P9_DUMP_PKT(1, req->rc); - if (!req->tc->pbuf_size) { + if (non_zc) { if (data) { memmove(data, dataptr, count); } else { @@ -1396,6 +1576,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, u64 offset, u32 count) { int err, rsize; + int kernel_buf = 0; struct p9_client *clnt; struct p9_req_t *req; @@ 
-1411,19 +1592,24 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, if (count < rsize) rsize = count; - /* Don't bother zerocopy form small IO (< 1024) */ - if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { - req = p9_client_rpc(clnt, P9_TWRITE, "dqE", fid->fid, offset, - rsize, data, udata); + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + char *odata; + if (data) { + kernel_buf = 1; + odata = data; + } else + odata = (char *)udata; + req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, odata, 0, rsize, + P9_ZC_HDR_SZ, kernel_buf, "dqd", + fid->fid, offset, rsize); } else { - if (data) req = p9_client_rpc(clnt, P9_TWRITE, "dqD", fid->fid, - offset, rsize, data); + offset, rsize, data); else req = p9_client_rpc(clnt, P9_TWRITE, "dqU", fid->fid, - offset, rsize, udata); + offset, rsize, udata); } if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1824,7 +2010,7 @@ EXPORT_SYMBOL_GPL(p9_client_xattrcreate); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) { - int err, rsize; + int err, rsize, non_zc = 0; struct p9_client *clnt; struct p9_req_t *req; char *dataptr; @@ -1842,13 +2028,18 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) if (count < rsize) rsize = count; - if ((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == - P9_TRANS_PREF_PAYLOAD_SEP) { - req = p9_client_rpc(clnt, P9_TREADDIR, "dqF", fid->fid, - offset, rsize, data); + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + /* + * response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREADDIR, data, NULL, rsize, 0, + 11, 1, "dqd", fid->fid, offset, rsize); } else { + non_zc = 1; req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, - offset, rsize); + offset, rsize); } if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1863,7 +2054,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) P9_DPRINTK(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); - if (!req->tc->pbuf_size && data) + if (non_zc) memmove(data, dataptr, count); p9_free_req(clnt, req); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index df58375ea6b3..b7d4e8aa5383 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -81,7 +81,7 @@ void p9stat_free(struct p9_wstat *stbuf) } EXPORT_SYMBOL(p9stat_free); -static size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size) +size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size) { size_t len = min(pdu->size - pdu->offset, size); memcpy(data, &pdu->sdata[pdu->offset], len); @@ -108,26 +108,6 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) return size - len; } -static size_t -pdu_write_urw(struct p9_fcall *pdu, const char *kdata, const char __user *udata, - size_t size) -{ - BUG_ON(pdu->size > P9_IOHDRSZ); - pdu->pubuf = (char __user *)udata; - pdu->pkbuf = (char *)kdata; - pdu->pbuf_size = size; - return 0; -} - -static size_t -pdu_write_readdir(struct p9_fcall *pdu, const char *kdata, size_t size) -{ - BUG_ON(pdu->size > P9_READDIRHDRSZ); - pdu->pkbuf = (char *)kdata; - pdu->pbuf_size = size; - return 0; -} - /* b - int8_t w - int16_t @@ -459,26 +439,6 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, errcode = -EFAULT; } break; - case 'E':{ - int32_t cnt = va_arg(ap, int32_t); - const char *k = va_arg(ap, const void *); - const char __user *u = 
va_arg(ap, - const void __user *); - errcode = p9pdu_writef(pdu, proto_version, "d", - cnt); - if (!errcode && pdu_write_urw(pdu, k, u, cnt)) - errcode = -EFAULT; - } - break; - case 'F':{ - int32_t cnt = va_arg(ap, int32_t); - const char *k = va_arg(ap, const void *); - errcode = p9pdu_writef(pdu, proto_version, "d", - cnt); - if (!errcode && pdu_write_readdir(pdu, k, cnt)) - errcode = -EFAULT; - } - break; case 'U':{ int32_t count = va_arg(ap, int32_t); const char __user *udata = @@ -637,10 +597,6 @@ void p9pdu_reset(struct p9_fcall *pdu) { pdu->offset = 0; pdu->size = 0; - pdu->private = NULL; - pdu->pubuf = NULL; - pdu->pkbuf = NULL; - pdu->pbuf_size = 0; } int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, diff --git a/net/9p/protocol.h b/net/9p/protocol.h index 2431c0f38d56..a0eb8ff11f22 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -32,3 +32,4 @@ int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); int p9pdu_finalize(struct p9_fcall *pdu); void p9pdu_dump(int, struct p9_fcall *); void p9pdu_reset(struct p9_fcall *pdu); +size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size); diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index 9a70ebdec56e..de8df957867d 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -21,30 +21,25 @@ /** * p9_release_req_pages - Release pages after the transaction. - * @*private: PDU's private page of struct trans_rpage_info */ -void -p9_release_req_pages(struct trans_rpage_info *rpinfo) +void p9_release_pages(struct page **pages, int nr_pages) { int i = 0; - - while (rpinfo->rp_data[i] && rpinfo->rp_nr_pages--) { - put_page(rpinfo->rp_data[i]); + while (pages[i] && nr_pages--) { + put_page(pages[i]); i++; } } -EXPORT_SYMBOL(p9_release_req_pages); +EXPORT_SYMBOL(p9_release_pages); /** * p9_nr_pages - Return number of pages needed to accommodate the payload. */ -int -p9_nr_pages(struct p9_req_t *req) +int p9_nr_pages(char *data, int len) { unsigned long start_page, end_page; - start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; - end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + - PAGE_SIZE - 1) >> PAGE_SHIFT; + start_page = (unsigned long)data >> PAGE_SHIFT; + end_page = ((unsigned long)data + len + PAGE_SIZE - 1) >> PAGE_SHIFT; return end_page - start_page; } EXPORT_SYMBOL(p9_nr_pages); @@ -58,35 +53,17 @@ EXPORT_SYMBOL(p9_nr_pages); * @nr_pages: number of pages to accommodate the payload * @rw: Indicates if the pages are for read or write. 
*/ -int -p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, - int nr_pages, u8 rw) -{ - uint32_t first_page_bytes = 0; - int32_t pdata_mapped_pages; - struct trans_rpage_info *rpinfo; - - *pdata_off = (__force size_t)req->tc->pubuf & (PAGE_SIZE-1); - if (*pdata_off) - first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), - req->tc->pbuf_size); +int p9_payload_gup(char *data, int *nr_pages, struct page **pages, int write) +{ + int nr_mapped_pages; - rpinfo = req->tc->private; - pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, - nr_pages, rw, &rpinfo->rp_data[0]); - if (pdata_mapped_pages <= 0) - return pdata_mapped_pages; + nr_mapped_pages = get_user_pages_fast((unsigned long)data, + *nr_pages, write, pages); + if (nr_mapped_pages <= 0) + return nr_mapped_pages; - rpinfo->rp_nr_pages = pdata_mapped_pages; - if (*pdata_off) { - *pdata_len = first_page_bytes; - *pdata_len += min((req->tc->pbuf_size - *pdata_len), - ((size_t)pdata_mapped_pages - 1) << PAGE_SHIFT); - } else { - *pdata_len = min(req->tc->pbuf_size, - (size_t)pdata_mapped_pages << PAGE_SHIFT); - } + *nr_pages = nr_mapped_pages; return 0; } EXPORT_SYMBOL(p9_payload_gup); diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h index 76309223bb02..173bb550a9eb 100644 --- a/net/9p/trans_common.h +++ b/net/9p/trans_common.h @@ -12,21 +12,6 @@ * */ -/* TRUE if it is user context */ -#define P9_IS_USER_CONTEXT (!segment_eq(get_fs(), KERNEL_DS)) - -/** - * struct trans_rpage_info - To store mapped page information in PDU. - * @rp_alloc:Set if this structure is allocd, not a reuse unused space in pdu. - * @rp_nr_pages: Number of mapped pages - * @rp_data: Array of page pointers - */ -struct trans_rpage_info { - u8 rp_alloc; - int rp_nr_pages; - struct page *rp_data[0]; -}; - -void p9_release_req_pages(struct trans_rpage_info *); -int p9_payload_gup(struct p9_req_t *, size_t *, int *, int, u8); -int p9_nr_pages(struct p9_req_t *); +void p9_release_pages(struct page **, int); +int p9_payload_gup(char *, int *, struct page **, int); +int p9_nr_pages(char *, int); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index e317583fcc73..32aa9834229c 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -150,12 +150,10 @@ static void req_done(struct virtqueue *vq) while (1) { spin_lock_irqsave(&chan->lock, flags); rc = virtqueue_get_buf(chan->vq, &len); - if (rc == NULL) { spin_unlock_irqrestore(&chan->lock, flags); break; } - chan->ring_bufs_avail = 1; spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. 
*/ @@ -163,17 +161,6 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); req = p9_tag_lookup(chan->client, rc->tag); - if (req->tc->private) { - struct trans_rpage_info *rp = req->tc->private; - int p = rp->rp_nr_pages; - /*Release pages */ - p9_release_req_pages(rp); - atomic_sub(p, &vp_pinned); - wake_up(&vp_wq); - if (rp->rp_alloc) - kfree(rp); - req->tc->private = NULL; - } req->status = REQ_STATUS_RCVD; p9_client_cb(chan->client, req); } @@ -193,9 +180,8 @@ static void req_done(struct virtqueue *vq) * */ -static int -pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, - int count) +static int pack_sg_list(struct scatterlist *sg, int start, + int limit, char *data, int count) { int s; int index = start; @@ -224,31 +210,36 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at - * @pdata_off: Offset into the first page * @**pdata: a list of pages to add into sg. + * @nr_pages: number of pages to pack into the scatter/gather list + * @data: data to pack into scatter/gather list * @count: amount of data to pack into the scatter/gather list */ static int -pack_sg_list_p(struct scatterlist *sg, int start, int limit, size_t pdata_off, - struct page **pdata, int count) +pack_sg_list_p(struct scatterlist *sg, int start, int limit, + struct page **pdata, int nr_pages, char *data, int count) { - int s; - int i = 0; + int i = 0, s; + int data_off; int index = start; - if (pdata_off) { - s = min((int)(PAGE_SIZE - pdata_off), count); - sg_set_page(&sg[index++], pdata[i++], s, pdata_off); - count -= s; - } - - while (count) { - BUG_ON(index > limit); - s = min((int)PAGE_SIZE, count); - sg_set_page(&sg[index++], pdata[i++], s, 0); + BUG_ON(nr_pages > (limit - start)); + /* + * if the first page doesn't start at + * page boundary find the offset + */ + data_off = offset_in_page(data); + while (nr_pages) { + s = rest_of_page(data); + if (s > count) + s = count; + sg_set_page(&sg[index++], pdata[i++], s, data_off); + data_off = 0; + data += s; count -= s; + nr_pages--; } - return index-start; + return index - start; } /** @@ -261,114 +252,166 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, size_t pdata_off, static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { - int in, out, inp, outp; - struct virtio_chan *chan = client->trans; + int err; + int in, out; unsigned long flags; - size_t pdata_off = 0; - struct trans_rpage_info *rpinfo = NULL; - int err, pdata_len = 0; + struct virtio_chan *chan = client->trans; P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); req->status = REQ_STATUS_SENT; +req_retry: + spin_lock_irqsave(&chan->lock, flags); + + /* Handle out VirtIO ring buffers */ + out = pack_sg_list(chan->sg, 0, + VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); - if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { - int nr_pages = p9_nr_pages(req); - int rpinfo_size = sizeof(struct trans_rpage_info) + - sizeof(struct page *) * nr_pages; + in = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); - if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_interruptible(vp_wq, - atomic_read(&vp_pinned) < chan->p9_max_pages); + err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); + if (err < 0) { + if (err == -ENOSPC) { + 
chan->ring_bufs_avail = 0; + spin_unlock_irqrestore(&chan->lock, flags); + err = wait_event_interruptible(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; - P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); - } - if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { - /* We can use sdata */ - req->tc->private = req->tc->sdata + req->tc->size; - rpinfo = (struct trans_rpage_info *)req->tc->private; - rpinfo->rp_alloc = 0; + P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); + goto req_retry; } else { - req->tc->private = kmalloc(rpinfo_size, GFP_NOFS); - if (!req->tc->private) { - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: " - "private kmalloc returned NULL"); - return -ENOMEM; - } - rpinfo = (struct trans_rpage_info *)req->tc->private; - rpinfo->rp_alloc = 1; + spin_unlock_irqrestore(&chan->lock, flags); + P9_DPRINTK(P9_DEBUG_TRANS, + "9p debug: " + "virtio rpc add_buf returned failure"); + return -EIO; } + } + virtqueue_kick(chan->vq); + spin_unlock_irqrestore(&chan->lock, flags); - err = p9_payload_gup(req, &pdata_off, &pdata_len, nr_pages, - req->tc->id == P9_TREAD ? 1 : 0); - if (err < 0) { - if (rpinfo->rp_alloc) - kfree(rpinfo); + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); + return 0; +} + +static int p9_get_mapped_pages(struct virtio_chan *chan, + struct page **pages, char *data, + int nr_pages, int write, int kern_buf) +{ + int err; + if (!kern_buf) { + /* + * We allow only p9_max_pages pinned. We wait for the + * Other zc request to finish here + */ + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + (atomic_read(&vp_pinned) < chan->p9_max_pages)); + if (err == -ERESTARTSYS) + return err; + } + err = p9_payload_gup(data, &nr_pages, pages, write); + if (err < 0) return err; - } else { - atomic_add(rpinfo->rp_nr_pages, &vp_pinned); + atomic_add(nr_pages, &vp_pinned); + } else { + /* kernel buffer, no need to pin pages */ + int s, index = 0; + int count = nr_pages; + while (nr_pages) { + s = rest_of_page(data); + pages[index++] = virt_to_page(data); + data += s; + nr_pages--; } + nr_pages = count; } + return nr_pages; +} -req_retry_pinned: - spin_lock_irqsave(&chan->lock, flags); +/** + * p9_virtio_zc_request - issue a zero copy request + * @client: client instance issuing the request + * @req: request to be issued + * @uidata: user bffer that should be ued for zero copy read + * @uodata: user buffer that shoud be user for zero copy write + * @inlen: read buffer size + * @olen: write buffer size + * @hdrlen: reader header size, This is the size of response protocol data + * + */ +static int +p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + char *uidata, char *uodata, int inlen, + int outlen, int in_hdr_len, int kern_buf) +{ + int in, out, err; + unsigned long flags; + int in_nr_pages = 0, out_nr_pages = 0; + struct page **in_pages = NULL, **out_pages = NULL; + struct virtio_chan *chan = client->trans; - /* Handle out VirtIO ring buffers */ - out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, - req->tc->size); - - if (req->tc->pbuf_size && (req->tc->id == P9_TWRITE)) { - /* We have additional write payload buffer to take care */ - if (req->tc->pubuf && P9_IS_USER_CONTEXT) { - outp = pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, - pdata_off, rpinfo->rp_data, pdata_len); - } else { - char *pbuf; - if (req->tc->pubuf) - pbuf = (__force char *) req->tc->pubuf; - else - pbuf = req->tc->pkbuf; - outp = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, pbuf, - 
req->tc->pbuf_size); + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); + + if (uodata) { + out_nr_pages = p9_nr_pages(uodata, outlen); + out_pages = kmalloc(sizeof(struct page *) * out_nr_pages, + GFP_NOFS); + if (!out_pages) { + err = -ENOMEM; + goto err_out; + } + out_nr_pages = p9_get_mapped_pages(chan, out_pages, uodata, + out_nr_pages, 0, kern_buf); + if (out_nr_pages < 0) { + err = out_nr_pages; + kfree(out_pages); + out_pages = NULL; + goto err_out; } - out += outp; } - - /* Handle in VirtIO ring buffers */ - if (req->tc->pbuf_size && - ((req->tc->id == P9_TREAD) || (req->tc->id == P9_TREADDIR))) { - /* - * Take care of additional Read payload. - * 11 is the read/write header = PDU Header(7) + IO Size (4). - * Arrange in such a way that server places header in the - * alloced memory and payload onto the user buffer. - */ - inp = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, 11); - /* - * Running executables in the filesystem may result in - * a read request with kernel buffer as opposed to user buffer. - */ - if (req->tc->pubuf && P9_IS_USER_CONTEXT) { - in = pack_sg_list_p(chan->sg, out+inp, VIRTQUEUE_NUM, - pdata_off, rpinfo->rp_data, pdata_len); - } else { - char *pbuf; - if (req->tc->pubuf) - pbuf = (__force char *) req->tc->pubuf; - else - pbuf = req->tc->pkbuf; - - in = pack_sg_list(chan->sg, out+inp, VIRTQUEUE_NUM, - pbuf, req->tc->pbuf_size); + if (uidata) { + in_nr_pages = p9_nr_pages(uidata, inlen); + in_pages = kmalloc(sizeof(struct page *) * in_nr_pages, + GFP_NOFS); + if (!in_pages) { + err = -ENOMEM; + goto err_out; + } + in_nr_pages = p9_get_mapped_pages(chan, in_pages, uidata, + in_nr_pages, 1, kern_buf); + if (in_nr_pages < 0) { + err = in_nr_pages; + kfree(in_pages); + in_pages = NULL; + goto err_out; } - in += inp; - } else { - in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, - req->rc->sdata, req->rc->capacity); } + req->status = REQ_STATUS_SENT; +req_retry_pinned: + spin_lock_irqsave(&chan->lock, flags); + /* out data */ + out = pack_sg_list(chan->sg, 0, + VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + + if (out_pages) + out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, + out_pages, out_nr_pages, uodata, outlen); + /* + * Take care of in data + * For example TREAD have 11. + * 11 is the read/write header = PDU Header(7) + IO Size (4). + * Arrange in such a way that server places header in the + * alloced memory and payload onto the user buffer. 
+ */ + in = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); + if (in_pages) + in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, + in_pages, in_nr_pages, uidata, inlen); err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); if (err < 0) { @@ -376,28 +419,45 @@ req_retry_pinned: chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + chan->ring_bufs_avail); if (err == -ERESTARTSYS) - return err; + goto err_out; P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, - "9p debug: " - "virtio rpc add_buf returned failure"); - if (rpinfo && rpinfo->rp_alloc) - kfree(rpinfo); - return -EIO; + "9p debug: " + "virtio rpc add_buf returned failure"); + err = -EIO; + goto err_out; } } - virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); - return 0; + err = wait_event_interruptible(*req->wq, + req->status >= REQ_STATUS_RCVD); + /* + * Non kernel buffers are pinned, unpin them + */ +err_out: + if (!kern_buf) { + if (in_pages) { + p9_release_pages(in_pages, in_nr_pages); + atomic_sub(in_nr_pages, &vp_pinned); + } + if (out_pages) { + p9_release_pages(out_pages, out_nr_pages); + atomic_sub(out_nr_pages, &vp_pinned); + } + /* wakeup anybody waiting for slots to pin pages */ + wake_up(&vp_wq); + } + kfree(in_pages); + kfree(out_pages); + return err; } static ssize_t p9_mount_tag_show(struct device *dev, @@ -591,8 +651,8 @@ static struct p9_trans_module p9_virtio_trans = { .create = p9_virtio_create, .close = p9_virtio_close, .request = p9_virtio_request, + .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, - /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accomodate, address @@ -600,7 +660,6 @@ static struct p9_trans_module p9_virtio_trans = { * page in zero copy. */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), - .pref = P9_TRANS_PREF_PAYLOAD_SEP, .def = 0, .owner = THIS_MODULE, }; -- cgit v1.2.3-58-ga151 From 464f5ecf00bb4513ba257520678f5168452f67ba Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Aug 2011 11:31:21 +0530 Subject: fs/9p: inode file operations are properly initialized by init_special_inode Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e3c03db3c788..b5a1076aaa6c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFSOCK: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); -- cgit v1.2.3-58-ga151 From 4d5077f1b2aa502a0ca98b450d1b16fbccfe9c63 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Aug 2011 12:19:34 +0530 Subject: fs/9p: Cleanup option parsing in 9p Instead of requiring that all integer-argument options be listed at the beginning, move the integer parsing into each option's case.
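Reduced to a minimal sketch, the parser then takes the following shape (the option table below is hypothetical, for illustration only; the real ones are the tokens arrays in fs/9p/v9fs.c and net/9p/client.c):

	#include <linux/parser.h>

	enum { Opt_msize, Opt_trans, Opt_err };

	static const match_table_t demo_tokens = {
		{ Opt_msize, "msize=%u" },
		{ Opt_trans, "trans=%s" },
		{ Opt_err, NULL }
	};

	static int demo_parse_token(char *p, substring_t *args, int *msize)
	{
		int option, r;

		switch (match_token(p, demo_tokens, args)) {
		case Opt_msize:
			/* integer options each call match_int() themselves now */
			r = match_int(&args[0], &option);
			if (r < 0)
				return r;	/* "integer field, but no integer?" */
			*msize = option;
			break;
		case Opt_trans:
			/* string option - match_int() is never involved */
			break;
		}
		return 0;
	}

The benefit is that the token enum no longer has to keep all integer options grouped below a sentinel such as Opt_uname or Opt_trans; the ordering of the table becomes irrelevant.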
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 33 ++++++++++++++++++++++++++------- net/9p/client.c | 12 +++++------- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef9661886112..2b78014a124a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_uname) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_debug: v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; @@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltuid = option; break; case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltgid = option; break; case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->afid = option; break; case Opt_uname: diff --git a/net/9p/client.c b/net/9p/client.c index 9eadadb0a698..0edee4de608a 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -123,21 +123,19 @@ static int parse_opts(char *opts, struct p9_client *clnt) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_trans) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_msize: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_msize: clnt->msize = option; break; case Opt_trans: -- cgit v1.2.3-58-ga151 From 348b59012e5c6402741d067cf6eeeb6271999d06 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 7 Aug 2011 00:46:59 +0530 Subject: net/9p: Convert net/9p protocol dumps to tracepoints This gives more control over debugging.
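These are ordinary ftrace events, so the standard tracefs interface applies (generic ftrace usage, nothing specific to this patch):

root@qemu-img-64:~# echo 1 > /sys/kernel/debug/tracing/events/9p/enable

With the events enabled, output like the following transcript lands in the trace buffer: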
root@qemu-img-64:~# ls /pass/123 ls: cannot access /pass/123: No such file or directory root@qemu-img-64:~# cat /sys/kernel/debug/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | ls-1536 [001] 70.928584: 9p_protocol_dump: clnt 18446612132784021504 P9_TWALK(tag = 1) 000: 16 00 00 00 6e 01 00 01 00 00 00 02 00 00 00 01 010: 00 03 00 31 32 33 00 00 00 ff ff ff ff 00 00 00 ls-1536 [001] 70.928587: => trace_9p_protocol_dump => p9pdu_finalize => p9_client_rpc => p9_client_walk => v9fs_vfs_lookup => d_alloc_and_lookup => walk_component => path_lookupat ls-1536 [000] 70.929696: 9p_protocol_dump: clnt 18446612132784021504 P9_RLERROR(tag = 1) 000: 0b 00 00 00 07 01 00 02 00 00 00 4e 03 00 02 00 010: 00 00 00 00 03 00 02 00 00 00 00 00 ff 43 00 00 ls-1536 [000] 70.929697: => trace_9p_protocol_dump => p9_client_rpc => p9_client_walk => v9fs_vfs_lookup => d_alloc_and_lookup => walk_component => path_lookupat => do_path_lookup Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 12 ++-- include/net/9p/9p.h | 3 - include/net/9p/client.h | 6 +- include/trace/events/9p.h | 176 ++++++++++++++++++++++++++++++++++++++++++++++ net/9p/client.c | 77 ++++++++++---------- net/9p/protocol.c | 53 ++++---------- net/9p/protocol.h | 3 +- 7 files changed, 238 insertions(+), 92 deletions(-) create mode 100644 include/trace/events/9p.h (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ce6600f33659..598fff1a54e5 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (rdir->head < rdir->tail) { p9stat_init(&st); - err = p9stat_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, &st, - fid->clnt->proto_version); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; @@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { - err = p9dirent_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, - &curdirent, - fid->clnt->proto_version); + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index d83a01300871..2d70b95b3b55 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -76,11 +76,8 @@ do { \ } \ } while (0) -#define P9_DUMP_PKT(way, pdu) p9pdu_dump(way, pdu) - #else #define P9_DPRINTK(level, format, arg...) 
do { } while (0) -#define P9_DUMP_PKT(way, pdu) do { } while (0) #endif diff --git a/include/net/9p/client.h b/include/net/9p/client.h index d479d7dcc4d5..fc9b90b0c052 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -240,8 +240,8 @@ int p9_client_read(struct p9_fid *fid, char *data, char __user *udata, int p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, u64 offset, u32 count); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset); -int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, - int proto_version); +int p9dirent_read(struct p9_client *clnt, char *buf, int len, + struct p9_dirent *dirent); struct p9_wstat *p9_client_stat(struct p9_fid *fid); int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst); int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *attr); @@ -259,7 +259,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *, u16); void p9_client_cb(struct p9_client *c, struct p9_req_t *req); int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int); -int p9stat_read(char *, int, struct p9_wstat *, int); +int p9stat_read(struct p9_client *, char *, int, struct p9_wstat *); void p9stat_free(struct p9_wstat *); int p9_is_proto_dotu(struct p9_client *clnt); diff --git a/include/trace/events/9p.h b/include/trace/events/9p.h new file mode 100644 index 000000000000..beeaed8398ec --- /dev/null +++ b/include/trace/events/9p.h @@ -0,0 +1,176 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM 9p + +#if !defined(_TRACE_9P_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_9P_H + +#include + +#define show_9p_op(type) \ + __print_symbolic(type, \ + { P9_TLERROR, "P9_TLERROR" }, \ + { P9_RLERROR, "P9_RLERROR" }, \ + { P9_TSTATFS, "P9_TSTATFS" }, \ + { P9_RSTATFS, "P9_RSTATFS" }, \ + { P9_TLOPEN, "P9_TLOPEN" }, \ + { P9_RLOPEN, "P9_RLOPEN" }, \ + { P9_TLCREATE, "P9_TLCREATE" }, \ + { P9_RLCREATE, "P9_RLCREATE" }, \ + { P9_TSYMLINK, "P9_TSYMLINK" }, \ + { P9_RSYMLINK, "P9_RSYMLINK" }, \ + { P9_TMKNOD, "P9_TMKNOD" }, \ + { P9_RMKNOD, "P9_RMKNOD" }, \ + { P9_TRENAME, "P9_TRENAME" }, \ + { P9_RRENAME, "P9_RRENAME" }, \ + { P9_TREADLINK, "P9_TREADLINK" }, \ + { P9_RREADLINK, "P9_RREADLINK" }, \ + { P9_TGETATTR, "P9_TGETATTR" }, \ + { P9_RGETATTR, "P9_RGETATTR" }, \ + { P9_TSETATTR, "P9_TSETATTR" }, \ + { P9_RSETATTR, "P9_RSETATTR" }, \ + { P9_TXATTRWALK, "P9_TXATTRWALK" }, \ + { P9_RXATTRWALK, "P9_RXATTRWALK" }, \ + { P9_TXATTRCREATE, "P9_TXATTRCREATE" }, \ + { P9_RXATTRCREATE, "P9_RXATTRCREATE" }, \ + { P9_TREADDIR, "P9_TREADDIR" }, \ + { P9_RREADDIR, "P9_RREADDIR" }, \ + { P9_TFSYNC, "P9_TFSYNC" }, \ + { P9_RFSYNC, "P9_RFSYNC" }, \ + { P9_TLOCK, "P9_TLOCK" }, \ + { P9_RLOCK, "P9_RLOCK" }, \ + { P9_TGETLOCK, "P9_TGETLOCK" }, \ + { P9_RGETLOCK, "P9_RGETLOCK" }, \ + { P9_TLINK, "P9_TLINK" }, \ + { P9_RLINK, "P9_RLINK" }, \ + { P9_TMKDIR, "P9_TMKDIR" }, \ + { P9_RMKDIR, "P9_RMKDIR" }, \ + { P9_TRENAMEAT, "P9_TRENAMEAT" }, \ + { P9_RRENAMEAT, "P9_RRENAMEAT" }, \ + { P9_TUNLINKAT, "P9_TUNLINKAT" }, \ + { P9_RUNLINKAT, "P9_RUNLINKAT" }, \ + { P9_TVERSION, "P9_TVERSION" }, \ + { P9_RVERSION, "P9_RVERSION" }, \ + { P9_TAUTH, "P9_TAUTH" }, \ + { P9_RAUTH, "P9_RAUTH" }, \ + { P9_TATTACH, "P9_TATTACH" }, \ + { P9_RATTACH, "P9_RATTACH" }, \ + { P9_TERROR, "P9_TERROR" }, \ + { P9_RERROR, "P9_RERROR" }, \ + { P9_TFLUSH, "P9_TFLUSH" }, \ + { P9_RFLUSH, "P9_RFLUSH" }, \ + { P9_TWALK, "P9_TWALK" }, \ + { P9_RWALK, "P9_RWALK" }, \ + { P9_TOPEN, "P9_TOPEN" }, \ + { P9_ROPEN, "P9_ROPEN" }, \ + { P9_TCREATE, 
"P9_TCREATE" }, \ + { P9_RCREATE, "P9_RCREATE" }, \ + { P9_TREAD, "P9_TREAD" }, \ + { P9_RREAD, "P9_RREAD" }, \ + { P9_TWRITE, "P9_TWRITE" }, \ + { P9_RWRITE, "P9_RWRITE" }, \ + { P9_TCLUNK, "P9_TCLUNK" }, \ + { P9_RCLUNK, "P9_RCLUNK" }, \ + { P9_TREMOVE, "P9_TREMOVE" }, \ + { P9_RREMOVE, "P9_RREMOVE" }, \ + { P9_TSTAT, "P9_TSTAT" }, \ + { P9_RSTAT, "P9_RSTAT" }, \ + { P9_TWSTAT, "P9_TWSTAT" }, \ + { P9_RWSTAT, "P9_RWSTAT" }) + +TRACE_EVENT(9p_client_req, + TP_PROTO(struct p9_client *clnt, int8_t type, int tag), + + TP_ARGS(clnt, type, tag), + + TP_STRUCT__entry( + __field( void *, clnt ) + __field( __u8, type ) + __field( __u32, tag ) + ), + + TP_fast_assign( + __entry->clnt = clnt; + __entry->type = type; + __entry->tag = tag; + ), + + TP_printk("client %lu request %s tag %d", + (long)__entry->clnt, show_9p_op(__entry->type), + __entry->tag) + ); + +TRACE_EVENT(9p_client_res, + TP_PROTO(struct p9_client *clnt, int8_t type, int tag, int err), + + TP_ARGS(clnt, type, tag, err), + + TP_STRUCT__entry( + __field( void *, clnt ) + __field( __u8, type ) + __field( __u32, tag ) + __field( __u32, err ) + ), + + TP_fast_assign( + __entry->clnt = clnt; + __entry->type = type; + __entry->tag = tag; + __entry->err = err; + ), + + TP_printk("client %lu response %s tag %d err %d", + (long)__entry->clnt, show_9p_op(__entry->type), + __entry->tag, __entry->err) +); + +/* dump 32 bytes of protocol data */ +#define P9_PROTO_DUMP_SZ 32 +TRACE_EVENT(9p_protocol_dump, + TP_PROTO(struct p9_client *clnt, struct p9_fcall *pdu), + + TP_ARGS(clnt, pdu), + + TP_STRUCT__entry( + __field( void *, clnt ) + __field( __u8, type ) + __field( __u16, tag ) + __array( unsigned char, line, P9_PROTO_DUMP_SZ ) + ), + + TP_fast_assign( + __entry->clnt = clnt; + __entry->type = pdu->id; + __entry->tag = pdu->tag; + memcpy(__entry->line, pdu->sdata, P9_PROTO_DUMP_SZ); + ), + TP_printk("clnt %lu %s(tag = %d)\n%.3x: " + "%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n" + "%.3x: " + "%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n", + (long)__entry->clnt, show_9p_op(__entry->type), + __entry->tag, 0, + __entry->line[0], __entry->line[1], + __entry->line[2], __entry->line[3], + __entry->line[4], __entry->line[5], + __entry->line[6], __entry->line[7], + __entry->line[8], __entry->line[9], + __entry->line[10], __entry->line[11], + __entry->line[12], __entry->line[13], + __entry->line[14], __entry->line[15], + 16, + __entry->line[16], __entry->line[17], + __entry->line[18], __entry->line[19], + __entry->line[20], __entry->line[21], + __entry->line[22], __entry->line[23], + __entry->line[24], __entry->line[25], + __entry->line[26], __entry->line[27], + __entry->line[28], __entry->line[29], + __entry->line[30], __entry->line[31]) + ); + +#endif /* _TRACE_9P_H */ + +/* This part must be outside protection */ +#include diff --git a/net/9p/client.c b/net/9p/client.c index b1c02187f862..854ca7a911c4 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -38,6 +38,9 @@ #include #include "protocol.h" +#define CREATE_TRACE_POINTS +#include + /* * Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options @@ -464,11 +467,15 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int ecode; err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + /* + * dump the response from server + * This should be after check errors which poplulate pdu_fcall. 
+ */ + trace_9p_protocol_dump(c, req->rc); if (err) { P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } - if (type != P9_RERROR && type != P9_RLERROR) return 0; @@ -525,6 +532,11 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, char *ename = NULL; err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + /* + * dump the response from server + * This should be after parse_header which populates pdu_fcall. + */ + trace_9p_protocol_dump(c, req->rc); if (err) { P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -684,7 +696,8 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); if (err) goto reterr; - p9pdu_finalize(req->tc); + p9pdu_finalize(c, req->tc); + trace_9p_client_req(c, type, tag); return req; reterr: p9_free_req(c, req); @@ -755,13 +768,10 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) goto reterr; err = p9_check_errors(c, req); - if (!err) { - P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); + trace_9p_client_res(c, type, req->rc->tag, err); + if (!err) return req; - } reterr: - P9_DPRINTK(P9_DEBUG_MUX, - "exit: client %p op %d error: %d\n", c, type, err); p9_free_req(c, req); return ERR_PTR(err); } @@ -841,13 +851,10 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, goto reterr; err = p9_check_zc_errors(c, req, uidata, in_hdrlen, kern_buf); - if (!err) { - P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type); + trace_9p_client_res(c, type, req->rc->tag, err); + if (!err) return req; - } reterr: - P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d error: %d\n", c, type, - err); p9_free_req(c, req); return ERR_PTR(err); } @@ -935,7 +942,7 @@ static int p9_client_version(struct p9_client *c) err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); if (err) { P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err); - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(c, req->rc); goto error; } @@ -1072,15 +1079,14 @@ EXPORT_SYMBOL(p9_client_begin_disconnect); struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, char *uname, u32 n_uname, char *aname) { - int err; + int err = 0; struct p9_req_t *req; struct p9_fid *fid; struct p9_qid qid; - P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", - afid ? afid->fid : -1, uname, aname); - err = 0; + P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", + afid ?
afid->fid : -1, uname, aname); fid = p9_fid_create(clnt); if (IS_ERR(fid)) { err = PTR_ERR(fid); @@ -1097,7 +1103,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1157,7 +1163,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto clunk_fid; } @@ -1224,7 +1230,7 @@ int p9_client_open(struct p9_fid *fid, int mode) err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1267,7 +1273,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1312,7 +1318,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1351,7 +1357,7 @@ int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1542,12 +1548,11 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count); - P9_DUMP_PKT(1, req->rc); if (non_zc) { if (data) { @@ -1617,7 +1622,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -1657,7 +1662,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1708,7 +1713,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1856,7 +1861,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto error; } @@ -1963,7 +1968,7 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, } err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); goto clunk_fid; } @@ -2047,7 +2052,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) err = 
p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } @@ -2084,7 +2089,7 @@ int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, @@ -2115,7 +2120,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, @@ -2150,7 +2155,7 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); @@ -2183,7 +2188,7 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) &glock->start, &glock->length, &glock->proc_id, &glock->client_id); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " @@ -2211,7 +2216,7 @@ int p9_client_readlink(struct p9_fid *fid, char **target) err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); if (err) { - P9_DUMP_PKT(1, req->rc); + trace_9p_protocol_dump(clnt, req->rc); goto error; } P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index b7d4e8aa5383..55e10a96c902 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -37,40 +37,11 @@ #include #include "protocol.h" +#include + static int p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); -#ifdef CONFIG_NET_9P_DEBUG -void -p9pdu_dump(int way, struct p9_fcall *pdu) -{ - int len = pdu->size; - - if ((p9_debug_level & P9_DEBUG_VPKT) != P9_DEBUG_VPKT) { - if ((p9_debug_level & P9_DEBUG_PKT) == P9_DEBUG_PKT) { - if (len > 32) - len = 32; - } else { - /* shouldn't happen */ - return; - } - } - - if (way) - print_hex_dump_bytes("[9P] ", DUMP_PREFIX_OFFSET, pdu->sdata, - len); - else - print_hex_dump_bytes("]9P[ ", DUMP_PREFIX_OFFSET, pdu->sdata, - len); -} -#else -void -p9pdu_dump(int way, struct p9_fcall *pdu) -{ -} -#endif -EXPORT_SYMBOL(p9pdu_dump); - void p9stat_free(struct p9_wstat *stbuf) { kfree(stbuf->name); @@ -551,7 +522,7 @@ p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) 
return ret; } -int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) +int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) { struct p9_fcall fake_pdu; int ret; @@ -561,10 +532,10 @@ int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) fake_pdu.sdata = buf; fake_pdu.offset = 0; - ret = p9pdu_readf(&fake_pdu, proto_version, "S", st); + ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st); if (ret) { P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); - P9_DUMP_PKT(0, &fake_pdu); + trace_9p_protocol_dump(clnt, &fake_pdu); } return ret; @@ -577,7 +548,7 @@ int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type) return p9pdu_writef(pdu, 0, "dbw", 0, type, tag); } -int p9pdu_finalize(struct p9_fcall *pdu) +int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu) { int size = pdu->size; int err; @@ -586,7 +557,7 @@ int p9pdu_finalize(struct p9_fcall *pdu) err = p9pdu_writef(pdu, 0, "d", size); pdu->size = size; - P9_DUMP_PKT(0, pdu); + trace_9p_protocol_dump(clnt, pdu); P9_DPRINTK(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", pdu->size, pdu->id, pdu->tag); @@ -599,8 +570,8 @@ void p9pdu_reset(struct p9_fcall *pdu) pdu->size = 0; } -int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, - int proto_version) +int p9dirent_read(struct p9_client *clnt, char *buf, int len, + struct p9_dirent *dirent) { struct p9_fcall fake_pdu; int ret; @@ -611,11 +582,11 @@ int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, fake_pdu.sdata = buf; fake_pdu.offset = 0; - ret = p9pdu_readf(&fake_pdu, proto_version, "Qqbs", &dirent->qid, - &dirent->d_off, &dirent->d_type, &nameptr); + ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid, + &dirent->d_off, &dirent->d_type, &nameptr); if (ret) { P9_DPRINTK(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); - P9_DUMP_PKT(1, &fake_pdu); + trace_9p_protocol_dump(clnt, &fake_pdu); goto out; } diff --git a/net/9p/protocol.h b/net/9p/protocol.h index a0eb8ff11f22..2cc525fa49fa 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -29,7 +29,6 @@ int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, va_list ap); int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); -int p9pdu_finalize(struct p9_fcall *pdu); -void p9pdu_dump(int, struct p9_fcall *); +int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu); void p9pdu_reset(struct p9_fcall *pdu); size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size); -- cgit v1.2.3-58-ga151 From 85160e03a79e0d7f9082e61f6a784abc6f402701 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:29 +0400 Subject: CIFS: Implement caching mechanism for mandatory brlocks If we have an oplock and negotiate mandatory locking style we handle all brlock requests on the client. 
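The core idea, reduced to a sketch (the demo_* helpers are hypothetical stand-ins; the real implementation is in fs/cifs/file.c below): while can_cache_brlcks is set, a lock is merely recorded on a per-inode list and conflicts are answered locally; once the oplock is broken, cifs_push_locks() sends the accumulated list to the server.

	/* minimal sketch, not the actual CIFS code */
	static int demo_lock_range(struct cifsInodeInfo *cinode,
				   struct cifsLockInfo *lck)
	{
		if (demo_find_conflict(cinode, lck))	/* hypothetical helper */
			return -EACCES;	/* conflict decided without a round trip */
		if (cinode->can_cache_brlcks) {
			list_add_tail(&lck->llist, &cinode->llist);
			return 0;	/* cached; the server sees nothing yet */
		}
		return demo_send_lock_to_server(lck);	/* hypothetical helper */
	}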
Signed-off-by: Pavel Shilovsky Acked-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 + fs/cifs/file.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 197 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d153d0b89d39..8238aa13e01c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -496,6 +496,8 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); */ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ + struct list_head blist; /* pointer to locks blocked on this */ + wait_queue_head_t block_q; __u64 offset; __u64 length; __u32 pid; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a3b545ff5250..34cbbee30b18 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -275,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, spin_unlock(&cifs_file_list_lock); cifs_set_oplock_level(pCifsInode, oplock); + pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; file->private_data = pCifsFile; return pCifsFile; } +static void cifs_del_lock_waiters(struct cifsLockInfo *lock); + /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding @@ -335,6 +338,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (li->netfid != cifs_file->netfid) continue; list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } mutex_unlock(&cifsi->lock_mutex); @@ -640,24 +644,182 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsInodeInfo *cinode, __u64 len, - __u64 offset, __u8 type, __u16 netfid) +static struct cifsLockInfo * +cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) { struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (li == NULL) - return -ENOMEM; + if (!li) + return li; li->netfid = netfid; li->offset = offset; li->length = len; li->type = type; li->pid = current->tgid; + INIT_LIST_HEAD(&li->blist); + init_waitqueue_head(&li->block_q); + return li; +} + +static void +cifs_del_lock_waiters(struct cifsLockInfo *lock) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, &lock->blist, blist) { + list_del_init(&li->blist); + wake_up(&li->block_q); + } +} + +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, + __u64 length, __u8 type, __u16 netfid, + struct cifsLockInfo **conf_lock) +{ + struct cifsLockInfo *li, *tmp; + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (offset + length <= li->offset || + offset >= li->offset + li->length) + continue; + else if ((type & LOCKING_ANDX_SHARED_LOCK) && + ((netfid == li->netfid && current->tgid == li->pid) || + type == li->type)) + continue; + else { + *conf_lock = li; + return true; + } + } + return false; +} + +static int +cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, struct file_lock *flock) +{ + int rc = 0; + struct cifsLockInfo *conf_lock; + bool exist; + + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (exist) { + flock->fl_start = conf_lock->offset; + flock->fl_end = conf_lock->offset + conf_lock->length - 1; + flock->fl_pid = conf_lock->pid; + if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK) + flock->fl_type = F_RDLCK; + else + flock->fl_type = F_WRLCK; + } else if 
(!cinode->can_cache_brlcks) + rc = 1; + else + flock->fl_type = F_UNLCK; + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, + __u8 type, __u16 netfid) +{ + struct cifsLockInfo *li; + + li = cifs_lock_init(len, offset, type, netfid); + if (!li) + return -ENOMEM; + mutex_lock(&cinode->lock_mutex); list_add_tail(&li->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); return 0; } +static int +cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, bool wait) +{ + struct cifsLockInfo *lock, *conf_lock; + bool exist; + int rc = 0; + + lock = cifs_lock_init(length, offset, type, netfid); + if (!lock) + return -ENOMEM; + +try_again: + exist = false; + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (!exist && cinode->can_cache_brlcks) { + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); + return rc; + } + + if (!exist) + rc = 1; + else if (!wait) + rc = -EACCES; + else { + list_add_tail(&lock->blist, &conf_lock->blist); + mutex_unlock(&cinode->lock_mutex); + rc = wait_event_interruptible(lock->block_q, + (lock->blist.prev == &lock->blist) && + (lock->blist.next == &lock->blist)); + if (!rc) + goto try_again; + else { + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); + mutex_unlock(&cinode->lock_mutex); + } + } + + kfree(lock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + int xid, rc = 0, stored_rc; + struct cifsLockInfo *li, *tmp; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + + xid = GetXid(); + tcon = tlink_tcon(cfile->tlink); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + stored_rc = CIFSSMBLock(xid, tcon, cfile->netfid, + li->pid, li->length, li->offset, + 0, 1, li->type, 0, 0); + if (stored_rc) + rc = stored_rc; + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} + static void cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, bool *wait_flag) @@ -708,6 +870,7 @@ cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, { int rc = 0; __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); __u16 netfid = cfile->netfid; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -723,6 +886,11 @@ cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, return rc; } + rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid, + flock); + if (!rc) + return rc; + /* BB we could chain these into one lock request BB */ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, flock->fl_start, 0, 1, type, 0, 0); @@ -790,12 +958,19 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, } if (lock) { + rc = cifs_lock_add_if(cinode, flock->fl_start, length, + type, netfid, wait_flag); + if (rc < 0) + return rc; + else if (!rc) + goto out; + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, - flock->fl_start, 0, lock, type, wait_flag, 0); + flock->fl_start, 0, 1, type, wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. 
*/ - rc = store_file_lock(cinode, length, flock->fl_start, - type, netfid); + rc = cifs_lock_add(cinode, length, flock->fl_start, + type, netfid); } } else if (unlock) { /* @@ -816,14 +991,19 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (cfile->netfid != li->netfid) continue; - stored_rc = CIFSSMBLock(xid, tcon, netfid, - current->tgid, li->length, - li->offset, 1, 0, li->type, - 0, 0); + if (!cinode->can_cache_brlcks) + stored_rc = CIFSSMBLock(xid, tcon, netfid, + current->tgid, + li->length, li->offset, + 1, 0, li->type, 0, 0); + else + stored_rc = 0; + if (stored_rc) rc = stored_rc; else { list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } } @@ -2404,6 +2584,10 @@ void cifs_oplock_break(struct work_struct *work) cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } + rc = cifs_push_locks(cfile); + if (rc) + cERROR(1, "Push locks rc = %d", rc); + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do -- cgit v1.2.3-58-ga151 From 4f6bcec910d45e4f46b1514977caa529bc69e645 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:30 +0400 Subject: CIFS: Implement caching mechanism for posix brlocks to handle all lock requests on the client in an exclusive oplock case. Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 +- fs/cifs/cifssmb.c | 8 +-- fs/cifs/file.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 147 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c25d0636cc4f..67c26cfe160d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -374,8 +374,8 @@ extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, - const __u64 len, struct file_lock *, + const __u16 smb_file_id, const __u32 netpid, + const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); extern int CIFSSMBEcho(struct TCP_Server_Info *server); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4435b11c41b9..6a45a1769388 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2393,9 +2393,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, const __u64 len, - struct file_lock *pLockData, const __u16 lock_type, - const bool waitFlag) + const __u16 smb_file_id, const __u32 netpid, const int get_flag, + const __u64 len, struct file_lock *pLockData, + const __u16 lock_type, const bool waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -2453,7 +2453,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, } else pSMB->Timeout = 0; - parm_data->pid = cpu_to_le32(current->tgid); + parm_data->pid = cpu_to_le32(netpid); parm_data->start = cpu_to_le64(pLockData->fl_start); parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 34cbbee30b18..805e2bd1dfd5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -788,7 +788,42 @@ try_again: } static int -cifs_push_locks(struct cifsFileInfo 
*cfile) +cifs_posix_lock_test(struct file *file, struct file_lock *flock) +{ + int rc = 0; + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + unsigned char saved_type = flock->fl_type; + + mutex_lock(&cinode->lock_mutex); + posix_test_lock(file, flock); + + if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { + flock->fl_type = saved_type; + rc = 1; + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_posix_lock_set(struct file *file, struct file_lock *flock) +{ + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + int rc; + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + return 1; + } + rc = posix_lock_file_wait(file, flock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_mandatory_locks(struct cifsFileInfo *cfile) { int xid, rc = 0, stored_rc; struct cifsLockInfo *li, *tmp; @@ -820,6 +855,91 @@ cifs_push_locks(struct cifsFileInfo *cfile) return rc; } +/* copied from fs/locks.c with a name change */ +#define cifs_for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; \ + lockp = &(*lockp)->fl_next) + +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct file_lock *flock, **before; + struct cifsLockInfo *lck, *tmp; + int rc = 0, xid, type; + __u64 length; + struct list_head locks_to_send; + + xid = GetXid(); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + INIT_LIST_HEAD(&locks_to_send); + + lock_flocks(); + cifs_for_each_lock(cfile->dentry->d_inode, before) { + flock = *before; + length = 1 + flock->fl_end - flock->fl_start; + if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + type = CIFS_RDLCK; + else + type = CIFS_WRLCK; + + lck = cifs_lock_init(length, flock->fl_start, type, + cfile->netfid); + if (!lck) { + rc = -ENOMEM; + goto send_locks; + } + lck->pid = flock->fl_pid; + + list_add_tail(&lck->llist, &locks_to_send); + } + +send_locks: + unlock_flocks(); + + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + struct file_lock tmp_lock; + int stored_rc; + + tmp_lock.fl_start = lck->offset; + stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, + 0, lck->length, &tmp_lock, + lck->type, 0); + if (stored_rc) + rc = stored_rc; + list_del(&lck->llist); + kfree(lck); + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return cifs_push_posix_locks(cfile); + + return cifs_push_mandatory_locks(cfile); +} + static void cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, bool *wait_flag) @@ -865,24 +985,30 @@ cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, } static int -cifs_getlk(struct cifsFileInfo *cfile, struct file_lock *flock, __u8 type, +cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, bool wait_flag, bool posix_lck, int xid) { int rc = 0; __u64 length = 1 + 
flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); __u16 netfid = cfile->netfid; - struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); if (posix_lck) { int posix_lock_type; + + rc = cifs_posix_lock_test(file, flock); + if (!rc) + return rc; + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, flock, posix_lock_type, - wait_flag); + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 1 /* get */, length, flock, + posix_lock_type, wait_flag); return rc; } @@ -944,6 +1070,11 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (posix_lck) { int posix_lock_type; + + rc = cifs_posix_lock_set(file, flock); + if (!rc || rc < 0) + return rc; + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else @@ -952,8 +1083,9 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, length, - flock, posix_lock_type, wait_flag); + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 0 /* set */, length, flock, + posix_lock_type, wait_flag); goto out; } @@ -1052,7 +1184,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) * negative length which we can not accept over the wire. */ if (IS_GETLK(cmd)) { - rc = cifs_getlk(cfile, flock, type, wait_flag, posix_lck, xid); + rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); FreeXid(xid); return rc; } -- cgit v1.2.3-58-ga151 From 9ee305b70e09f5132c9723780ce10e69710b8bca Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:31 +0400 Subject: CIFS: Send as many mandatory unlock ranges at once as possible, which reduces traffic and improves performance.
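The batching itself is straightforward (condensed sketch with hypothetical demo_* helpers; the real loop is cifs_unlock_range() in the diff below, which makes two passes, one per lock type): matching ranges are packed into one LOCKING_ANDX_RANGE array sized from the negotiated buffer, flushed whenever the array fills, and flushed once more at the end.

	/* illustrative sketch only */
	max_num = (maxBuf - sizeof(struct smb_hdr)) /
		   sizeof(LOCKING_ANDX_RANGE);
	num = 0;
	list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
		if (!demo_range_matches(li, flock))	/* hypothetical filter */
			continue;
		demo_fill_range(&buf[num], li);		/* hypothetical helper */
		if (++num == max_num) {
			rc = cifs_lockv(xid, tcon, netfid, type,
					num, 0, buf);	/* num unlocks, 0 locks */
			num = 0;
		}
	}
	if (num)
		rc = cifs_lockv(xid, tcon, netfid, type, num, 0, buf);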
Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 3 + fs/cifs/cifssmb.c | 40 +++++++++++++ fs/cifs/file.c | 160 ++++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 167 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 67c26cfe160d..ef4f631e4c01 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -368,6 +368,9 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6a45a1769388..6600aa2d2ef3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2320,6 +2320,46 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, return rc; } +int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; + struct kvec iov[2]; + int resp_buf_type; + __u16 count; + + cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->Timeout = 0; + pSMB->NumberOfLocks = cpu_to_le16(num_lock); + pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); + pSMB->LockType = lock_type; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; /* netfid stays le */ + + count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - + (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + iov[1].iov_base = (char *)buf; + iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + + cifs_stats_inc(&tcon->num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) + cFYI(1, "Send error in cifs_lockv = %d", rc); + + return rc; +} int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 805e2bd1dfd5..569184e6ee01 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1057,6 +1057,128 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, return rc; } +static void +cifs_move_llist(struct list_head *source, struct list_head *dest) +{ + struct list_head *li, *tmp; + list_for_each_safe(li, tmp, source) + list_move(li, dest); +} + +static void +cifs_free_llist(struct list_head *llist) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, llist, llist) { + cifs_del_lock_waiters(li); + list_del(&li->llist); + kfree(li); + } +} + +static int +cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) +{ + int rc = 0, stored_rc; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + unsigned int i; + unsigned int max_num, num; + LOCKING_ANDX_RANGE *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct 
cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + mutex_lock(&cinode->lock_mutex); + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cfile->netfid != li->netfid) + continue; + if (types[i] != li->type) + continue; + if (!cinode->can_cache_brlcks) { + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = + cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = + cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add + * it again to the inode list if the unlock + * range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->netfid, + li->type, num, + 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from + * the tmp list to the head of + * the inode list. + */ + cifs_move_llist(&tmp_llist, + &cinode->llist); + rc = stored_rc; + } else + /* + * The unlock range request + * succeed - free the tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; + } else { + /* + * We can cache brlock requests - simply remove + * a lock from the inode list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + } + } + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], num, 0, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cinode->llist); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } + } + + mutex_unlock(&cinode->lock_mutex); + kfree(buf); + return rc; +} + static int cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, bool wait_flag, bool posix_lck, int lock, int unlock, int xid) @@ -1104,43 +1226,9 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, rc = cifs_lock_add(cinode, length, flock->fl_start, type, netfid); } - } else if (unlock) { - /* - * For each stored lock that this unlock overlaps completely, - * unlock it. 
- */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; - - mutex_lock(&cinode->lock_mutex); - list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { - if (flock->fl_start > li->offset || - (flock->fl_start + length) < - (li->offset + li->length)) - continue; - if (current->tgid != li->pid) - continue; - if (cfile->netfid != li->netfid) - continue; - - if (!cinode->can_cache_brlcks) - stored_rc = CIFSSMBLock(xid, tcon, netfid, - current->tgid, - li->length, li->offset, - 1, 0, li->type, 0, 0); - else - stored_rc = 0; + } else if (unlock) + rc = cifs_unlock_range(cfile, flock, xid); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - cifs_del_lock_waiters(li); - kfree(li); - } - } - mutex_unlock(&cinode->lock_mutex); - } out: if (flock->fl_flags & FL_POSIX) posix_lock_file_wait(file, flock); -- cgit v1.2.3-58-ga151 From 32b9aaf1a53b3c8d435f86339b01b3968520cb0a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Oct 2011 15:33:32 +0400 Subject: CIFS: Make cifs_push_locks send as many locks at once as possible, which reduces traffic and improves performance. Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 569184e6ee01..ea096ce5d4f7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -829,6 +829,11 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + unsigned int num, max_num; + LOCKING_ANDX_RANGE *buf, *cur; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + int i; xid = GetXid(); tcon = tlink_tcon(cfile->tlink); @@ -840,17 +845,49 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } - list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { - stored_rc = CIFSSMBLock(xid, tcon, cfile->netfid, - li->pid, li->length, li->offset, - 0, 1, li->type, 0, 0); - if (stored_rc) - rc = stored_rc; + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (li->type != types[i]) + continue; + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, 0, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], 0, num, buf); + if (stored_rc) + rc = stored_rc; + } } cinode->can_cache_brlcks = false; mutex_unlock(&cinode->lock_mutex); + kfree(buf); FreeXid(xid); return rc; } -- cgit v1.2.3-58-ga151 From 611d7a5dc6f2a1a0cfd8cc07b9d15f794cbe5f98 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 4 Oct 2011 14:20:17 +0200 Subject: ore: Make ore_calc_stripe_info EXPORT_SYMBOL ore_calc_stripe_info is needed by exofs::export.c for the layout calculations.
Make it exportable Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 8 +++----- include/scsi/osd_ore.h | 3 +++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3b1cc3a132d7..d92998d5c2d6 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -57,9 +57,6 @@ MODULE_LICENSE("GPL"); * 3. Cache some havily used calculations that will be needed by users. */ -static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - enum { BIO_MAX_PAGES_KMALLOC = (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; @@ -409,8 +406,8 @@ EXPORT_SYMBOL(ore_check_io); * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ -static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si) +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; @@ -443,6 +440,7 @@ static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->group_length = T - H; si->M = M; } +EXPORT_SYMBOL(ore_calc_stripe_info); static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pgbase, struct ore_per_dev_state *per_dev, diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index af2231a0fd09..a8e39d14f82b 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -146,6 +146,9 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3-58-ga151 From 3e335672e018c06e007f85a5d54afd721fb3d6d5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 24 Oct 2011 16:36:33 -0700 Subject: fs/Makefile: Always inspect exofs/ fs/exofs directory has multiple targets now, of which the ore.ko will be needed by the pnfs-objects-layout-driver (fs/nfs/objlayout). As suggested by: Michal Marek convert inclusion of exofs/ from obj-$(CONFIG_EXOFS_FS) => obj-$(y). So ORE can be selected also from fs/nfs/Kconfig CC: Michal Marek CC: Al Viro Signed-off-by: Boaz Harrosh --- fs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index afc109691a9b..5c30a13341eb 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(y) += exofs/ # Multiple mods, used by nfs/objlayout obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ -- cgit v1.2.3-58-ga151 From a1fec1dbbc8db974d2582e4040590cebe72171e4 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 12 Oct 2011 18:42:22 +0200 Subject: ore: RAID5 read This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units handling when reading. For writes it inserts BLANK units where the XOR blocks should be calculated and written. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5,
the enablement of raid5 mounting and the setting of parity-count > 0 is disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later, once the basic XOR write is also in. But if the "enable RAID5" patch is applied, this code has been tested to properly read raid5 volumes according to the standard. It has also been tested that the new maths still properly support RAID0 and the grouping code, just as before. (BTW: I have found more bugs in the pnfs-obj RAID math, fixed here.) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will contain the special raid stuff that is not used in striping and mirrors. In future write support these will get bigger. When adding ore_raid.c to the Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep the source file, say ore.c, and the module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh --- fs/exofs/Kbuild | 3 +- fs/exofs/ore.c | 326 +++++++++++++++++++++++++++++++++++++------------ fs/exofs/ore_raid.c | 140 +++++++++++++++++++++ fs/exofs/ore_raid.h | 64 ++++++++++ include/scsi/osd_ore.h | 21 +++- 5 files changed, 473 insertions(+), 81 deletions(-) create mode 100644 fs/exofs/ore_raid.c create mode 100644 fs/exofs/ore_raid.h (limited to 'fs') diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855a6c44..352ba149d23e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -13,7 +13,8 @@ # # ore module library -obj-$(CONFIG_ORE) += ore.o +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d92998d5c2d6..fd6090ddd3bf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -24,24 +24,9 @@ #include #include +#include -#include - -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ +#include "ore_raid.h" MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); @@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -static int _get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - struct ore_io_state **pios) +static int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) { struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ?
_aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(numdevs)); - *pios = NULL; - return -ENOMEM; + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; } ios->layout = layout; @@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, { struct ore_io_state *ios; unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; int ret; - ret = _get_io_state(layout, oc, numdevs, pios); + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs); + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } + } + + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); if (unlikely(ret)) return ret; @@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, ios->offset = offset; if (length) { - ore_calc_stripe_info(layout, offset, &ios->si); - ios->length = (length <= ios->si.group_length) ? 
length : - ios->si.group_length; + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); } return 0; @@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state); int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, struct ore_io_state **pios) { - return _get_io_state(layout, oc, oc->numdevs, pios); + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios) bio_put(per_dev->bio); } + _ore_free_raid_stuff(ios); kfree(ios); } } @@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file * - * U - The number of bytes in a stripe within a group + * D - number of Data devices + * D = group_width - parity * - * U = stripe_unit * group_width + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D * * T - The number of bytes striped within a group of component objects * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth + * T = U * group_depth * * S - The number of bytes striped across all component objects * before the pattern repeats + * S = T * group_count * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * + * M - The "major" (i.e., across all components) cycle number * M = L / S * - * G - Counts the groups from the beginning of the major stripe - * + * G - Counts the groups from the beginning of the major cycle * G = (L - (M * S)) / T [or (L % S) / T] * * H - The byte offset within the group - * * H = (L - (M * S)) % T [or (L % S) % T] * * N - The "minor" (i.e., across the group) stripe number - * * N = H / U * * C - The component index coresponding to L * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] * * O - The component offset coresponding to L - * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) */ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si) + u64 length, struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; u64 group_depth = layout->group_depth; + u32 parity = layout->parity; - u32 U = stripe_unit * group_width; + u32 D = group_width - parity; + u32 U = D * stripe_unit; u64 T = U * group_depth; u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); @@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u32 N = div_u64(H, U); /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= layout->mirrors_p1; + u32 C = (u32)(H - (N * 
U)) / stripe_unit + G * group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); - si->group_length = T - H; + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); -static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct ore_per_dev_state *per_dev, - int cur_len) +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = @@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); ret = -ENOMEM; goto out; } @@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); + unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios) return 0; } - BUG_ON(length > si->group_length); + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; while (length) { unsigned comp = dev - first_dev; @@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (!per_dev->length) { per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { + if (dev == si->dev) { + WARN_ON(dev == 
si->par_dev); per_dev->offset = si->obj_offset; cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; cur_len = stripe_unit; } } else { @@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (cur_len >= length) cur_len = length; - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); if (unlikely(ret)) goto out; @@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios) dev = (dev % devs_in_group) + first_dev; length -= cur_len; + + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || + (!length && ios->parity_pages))) { + if (!length) + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + if (ios->reading) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + } } out: ios->numdevs = devs_in_group; @@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) per_dev->or = or; if (ios->pages) { - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(obj->id), + " dev=%d sg_len=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); + first_dev, per_dev->cur_sg); } else { BUG_ON(ios->kern_buff); @@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, { unsigned stripe_unit = layout->stripe_unit; - ore_calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..8d4b93a93c67 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). 
+ * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#include "ore_raid.h" + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +/* In writes @cur_len means length left. 
.i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; + unsigned array_start = 0; + unsigned i; + int ret; + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + /* TODO: only read support for now */ + clear_highpage(pages[i]); + } + + ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", + per_dev->dev, num_pages, ios->cur_par_page); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + /*TODO: Only raid writes has stuff to add here */ + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->parity_pages) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..c21080b4407f --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. 
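+ * For example (illustrative numbers, not from the patch): with
+ * devs_in_group=4, mirrors_p1=1, par_dev=2 and dev=3 this evaluates
+ * to ((4 + 3 - 2 - 1) % 4) / 1 == 0, i.e. the device right after the
+ * parity holds the stripe's first logical data unit.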
+ */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); + +/* ios.c stuff needed by ios_raid.c */ +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index a8e39d14f82b..43821c18cd3f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -40,6 +40,7 @@ struct ore_layout { unsigned mirrors_p1; unsigned group_width; + unsigned parity; u64 group_depth; unsigned group_count; @@ -89,11 +90,16 @@ static inline void ore_comp_set_dev( } struct ore_striping_info { + u64 offset; u64 obj_offset; - u64 group_length; + u64 length; + u64 first_stripe_start; /* only used in raid writes */ u64 M; /* for truncate */ + unsigned bytes_in_stripe; unsigned dev; + unsigned par_dev; unsigned unit_off; + unsigned cur_comp; }; struct ore_io_state; @@ -127,6 +133,13 @@ struct ore_io_state { bool reading; + /* House keeping of Parity pages */ + bool extra_part_alloc; + struct page **parity_pages; + unsigned max_par_pages; + unsigned cur_par_page; + unsigned sgs_per_dev; + /* Variable array of size numdevs */ unsigned numdevs; struct ore_per_dev_state { @@ -134,7 +147,10 @@ struct ore_io_state { struct bio *bio; loff_t offset; unsigned length; + unsigned last_sgs_total; unsigned dev; + struct osd_sg_entry *sglist; + unsigned cur_sg; } per_dev[]; }; @@ -147,8 +163,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - + u64 length, struct ore_striping_info *si); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3-58-ga151 From 769ba8d92025fa390f3097e658b8ed6e032d68e9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 14 Oct 2011 15:33:51 +0200 Subject: ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, But the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d | n = pages_in_stripe_unit; w = group_width - parity; | pages array presented to the XOR lib | | V | __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---| | | __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- | ... | ... 
| __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ | data added columns first then row --- The pages are put on this array columns first, i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches for a given page in cache, and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and synchronously read. All the pages of a stripe are read in one IO, using the scatter gather mechanism. In writes we constrain our IO to only be incomplete on a single stripe. Meaning either the complete IO is within a single stripe, so we might have pages to read at both the beginning and end of the stripe, or we have some reading to do at the beginning but end at a stripe boundary. The left over pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this.) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: ORE demands that stripe_size may not be bigger than 32 bits.) What else? Well, read it and tell me. (A standalone sketch of the parity-rotation math follows the Kconfig hunk below.) Signed-off-by: Boaz Harrosh --- fs/exofs/Kconfig | 9 +- fs/exofs/ore.c | 36 +++- fs/exofs/ore_raid.c | 534 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/exofs/ore_raid.h | 15 ++ include/scsi/osd_ore.h | 9 + 5 files changed, 587 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae4149291..fa9a286c8771 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,10 +1,17 @@ +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. config ORE tristate + depends on EXOFS_FS + select ASYNC_XOR + default SCSI_OSD_ULD config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD - select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage.
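The parity-rotation arithmetic this write path builds on (documented in ore_calc_stripe_info() by the preceding "ore: RAID5 read" patch) can be sanity-checked outside the kernel. The following is a minimal, self-contained userspace sketch, not part of the patch, using made-up layout numbers (64K stripe_unit, group_width=4, parity=1, mirrors_p1=1); it walks one parity cycle and prints which device holds each data unit and which holds the parity:

#include <stdio.h>
#include <stdint.h>

static unsigned gcd(unsigned a, unsigned b)
{
	while (b) {
		unsigned t = a % b;
		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* assumed example layout, chosen only for illustration */
	const unsigned stripe_unit = 65536, group_width = 4, parity = 1;
	const uint64_t group_depth = 2, group_count = 1;
	const unsigned D = group_width - parity;	/* data devices */
	const unsigned U = D * stripe_unit;		/* bytes per stripe */
	const uint64_t T = (uint64_t)U * group_depth;
	const uint64_t S = T * group_count;
	/* LCMdP = lcm(group_width, parity) / parity */
	const unsigned LCMdP = group_width / gcd(group_width, parity);
	uint64_t L;

	for (L = 0; L < S; L += stripe_unit) {
		uint64_t M = L / S, G = (L % S) / T, H = (L % S) % T;
		unsigned N = (unsigned)(H / U);		/* minor stripe number */
		unsigned C = (unsigned)(H - (uint64_t)N * U) / stripe_unit +
			     (unsigned)G * group_width;
		unsigned first_dev = C - C % group_width;
		unsigned RxP = (N % LCMdP) * parity;	/* R * parity */
		unsigned dev = (group_width + C - RxP) % group_width +
			       first_dev;
		unsigned par_dev = (group_width + group_width - parity - RxP) %
				   group_width + first_dev;

		printf("L=0x%06llx stripe=%u dev=%u par_dev=%u\n",
		       (unsigned long long)L, N, dev, par_dev);
	}
	return 0;
}

With these numbers stripe 0 keeps its parity on the last device and every following stripe rotates the parity one device backwards, matching the Craid/par_dev rotation the comment block describes.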
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fd6090ddd3bf..08ee454b2187 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -95,6 +95,14 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; + + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -118,7 +126,7 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -static int _ore_get_io_state(struct ore_layout *layout, +int _ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, unsigned numdevs, unsigned sgs_per_dev, unsigned num_par_pages, struct ore_io_state **pios) @@ -334,7 +342,7 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static int ore_io_execute(struct ore_io_state *ios) +int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -597,6 +605,8 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, ret = -ENOMEM; goto out; } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + pgbase = 0; ++pg; } @@ -636,6 +646,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; while (length) { unsigned comp = dev - first_dev; @@ -677,14 +688,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) length -= cur_len; si->cur_comp = (si->cur_comp + 1) % group_width; - if (unlikely((dev == si->par_dev) || - (!length && ios->parity_pages))) { - if (!length) + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { /* If we are writing and this is the very last * stripe. then operate on parity dev. */ dev = si->par_dev; - if (ios->reading) + } + if (ios->sp2d) /* In writes cur_len just means if it's the * last one. See _ore_add_parity_unit. */ @@ -709,6 +720,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) devs_in_group + first_dev; /* Next stripe, start fresh */ si->cur_comp = 0; + si->cur_pg = 0; } } out: @@ -873,6 +885,14 @@ int ore_write(struct ore_io_state *ios) int i; int ret; + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. 
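+ * (That is: a RAID-layout writer must point ios->r4w at a
+ * _ore_r4w_op vector, and set ios->private to the cookie those
+ * callbacks receive, before calling ore_write().)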
+ */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + ret = _prepare_for_striping(ios); if (unlikely(ret)) return ret; @@ -888,7 +908,7 @@ int ore_write(struct ore_io_state *ios) } EXPORT_SYMBOL(ore_write); -static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; @@ -952,7 +972,7 @@ int ore_read(struct ore_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _read_mirror(ios, i); + ret = _ore_read_mirror(ios, i); if (unlikely(ret)) return ret; } diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 8d4b93a93c67..29c47e5c4a86 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -14,9 +14,13 @@ */ #include +#include #include "ore_raid.h" +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); @@ -27,6 +31,236 @@ void _raid_page_free(struct page *p) __free_page(p); } +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! 
Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? 
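+ * (For reference: the async_xor() call above XORs the first
+ * data_devs pages into pages[data_devs]; ASYNC_TX_XOR_ZERO_DST
+ * zeroes the destination first, and with no DMA offload engine
+ * present async_tx falls back to a synchronous CPU xor.)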
+ */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last) { @@ -76,6 +310,240 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, } } +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_read_4_write(struct ore_io_state *ios, + struct ore_striping_info *si, struct page *page) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); + if (unlikely(added_len != PAGE_SIZE)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += PAGE_SIZE; + return 0; +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. 
At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; + int ret; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) + /* to-be-written pages start here */ + goto read_last_stripe; + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_read_4_write(ios, &read_si, *pp); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + offset = ios->offset + (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE * PAGE_SIZE; + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); + /* unaligned IO must be within a single stripe */ + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_read_4_write(ios, &read_si, page); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). 
Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + return 0; +} + /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, @@ -86,42 +554,89 @@ int _ore_add_parity_unit(struct ore_io_state *ios, BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); _ore_add_sg_seg(per_dev, cur_len, true); } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; struct page **pages = ios->parity_pages + ios->cur_par_page; - unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; + unsigned num_pages; unsigned array_start = 0; unsigned i; int ret; + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!cur_len) /* If last stripe operate on parity comp */ + si->cur_comp = sp2d->data_devs; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + _read_4_write(ios); + } + for (i = 0; i < num_pages; i++) { pages[i] = _raid_page_alloc(); if (unlikely(!pages[i])) return -ENOMEM; ++(ios->cur_par_page); - /* TODO: only read support for now */ - clear_highpage(pages[i]); } - ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", - per_dev->dev, num_pages, ios->cur_par_page); + BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, per_dev, num_pages * PAGE_SIZE); if (unlikely(ret)) return ret; + + /* TODO: raid6 if (last_parity_dev) */ + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); } return 0; } int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) { - /*TODO: Only raid writes has stuff to add here */ + struct ore_layout *layout = ios->layout; + + if (ios->parity_pages) { + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + unsigned stripe_size = ios->si.bytes_in_stripe; + u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + + BUG_ON(ios->offset % PAGE_SIZE); + + /* Round io down to last full strip */ + first_stripe = div_u64(ios->offset, stripe_size); + last_stripe = div_u64(ios->offset + ios->length, stripe_size); + + /* If an IO spans more then a single stripe it must end at + * a stripe boundary. The reminder at the end is pushed into the + * next IO. 
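+ * For example (illustrative numbers): with a 192K stripe (3 data
+ * devices times a 64K stripe_unit), an IO of offset=64K length=256K
+ * gets first_stripe=0 and last_stripe=1, so ios->length is trimmed
+ * to 1*192K - 64K = 128K and the caller re-submits the remainder.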
+ */ + if (last_stripe != first_stripe) { + ios->length = last_stripe * stripe_size - ios->offset; + + BUG_ON(!ios->length); + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE; + ios->si.length = ios->length; /*make it consistent */ + } + } return 0; } void _ore_free_raid_stuff(struct ore_io_state *ios) { - if (ios->parity_pages) { /* writing and raid */ + if (ios->sp2d) { /* writing and raid */ unsigned i; for (i = 0; i < ios->cur_par_page; i++) { @@ -132,9 +647,14 @@ void _ore_free_raid_stuff(struct ore_io_state *ios) } if (ios->extra_part_alloc) kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); } else { /* Will only be set if raid reading && sglist is big */ if (ios->extra_part_alloc) kfree(ios->per_dev[0].sglist); } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index c21080b4407f..2ffd2c3c6e46 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -57,8 +57,23 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, unsigned cur_len); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} /* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pgbase, struct page **pages, struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 43821c18cd3f..f05fa826f89e 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -99,11 +99,17 @@ struct ore_striping_info { unsigned dev; unsigned par_dev; unsigned unit_off; + unsigned cur_pg; unsigned cur_comp; }; struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); +struct _ore_r4w_op { + /* @Priv given here is passed ios->private */ + struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate); + void (*put_page)(void *priv, struct page *page); +}; struct ore_io_state { struct kref kref; @@ -139,6 +145,9 @@ struct ore_io_state { unsigned max_par_pages; unsigned cur_par_page; unsigned sgs_per_dev; + struct __stripe_pages_2d *sp2d; + struct ore_io_state *ios_read_4_write; + const struct _ore_r4w_op *r4w; /* Variable array of size numdevs */ unsigned numdevs; -- cgit v1.2.3-58-ga151 From dd296619974c50c46c67e58f355a7e85ef3f0c01 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 12 Oct 2011 15:42:07 +0200 Subject: exofs: Support for RAID5 read-4-write interface. The ORE needs to be supplied an r4w_get_page/r4w_put_page API from the filesystem so it can get cache pages to read into when writing partial stripes. Also I commented out and NULLed the .writepage (singular) vector, because it gives a terrible write pattern to raid and is apparently not needed.
Even in OOM conditions the system copes (even better) without it. TODO: How to specify to write_cache_pages() to start or include a certain page? Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 86c0ac87b8e3..3e5f3a6be90a 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -63,6 +63,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -81,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -93,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing @@ -391,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -560,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page = find_get_page(pcol->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if (pcol->that_locked_page != page) { + EXOFS_DBGMSG("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -589,6 +644,7 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; /* pages ownership was passed to pcol_copy */ @@ -773,6 +829,7 @@ static int exofs_writepages(struct address_space *mapping, return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -788,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control
*wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { @@ -894,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, -- cgit v1.2.3-58-ga151 From 44231e686b2ba3b5702db867bb84e6d76b7cf2c7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 21 Nov 2010 20:03:24 +0200 Subject: ore: Enable RAID5 mounts Now that we support raid5, enable it at mount. Raid6 will come next; raid4 is not in demand, so it will probably not be enabled (until someone wants it). NOTE: mkfs.exofs has had support for raid5/6 for a long time. (Making an empty raidX FS is just as easy as raid0 ;-} ) Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 08ee454b2187..fcfa86ae6faf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -49,9 +49,17 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) { u64 stripe_length; -/* FIXME: Only raid0 is supported for now. */ - if (layout->raid_algorithm != PNFS_OSD_RAID_0) { - ORE_ERR("Only RAID_0 for now\n"); + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5 for now\n"); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { -- cgit v1.2.3-58-ga151 From 750c9c47a5f9daa88333ac435a1afe4d4b428230 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 25 Oct 2011 05:35:05 -0400 Subject: ext4: remove messy logic from ext4_ext_rm_leaf - Both callers (truncate and punch_hole) already aligned the left end point, so we no longer need the split logic here (the remaining cases are sketched below). - Remove dead duplicated code. - Call ext4_ext_dirty only after we have updated eh_entries, otherwise we'll lose the entries update.
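A small self-contained sketch of the case analysis that remains after this cleanup; the names mirror the patch, but the helper itself is illustrative userspace code, not the kernel function:

#include <stdio.h>

/* Returns the new extent length in blocks, or -1 for the "bad truncate"
 * case that the aligned callers can no longer produce. [a, b] is the
 * removal range clipped to the extent [ex_ee_block, ex_end]. */
static int rm_leaf_case(unsigned a, unsigned b,
			unsigned ex_ee_block, unsigned ex_ee_len)
{
	unsigned ex_end = ex_ee_block + ex_ee_len - 1;

	if (a > ex_end || b < ex_ee_block)
		return ex_ee_len;	/* no overlap: extent is untouched */
	if (b != ex_end)
		return -1;		/* head/middle removal: ruled out by the callers */
	if (a != ex_ee_block)
		return a - ex_ee_block;	/* remove the tail of the extent */
	return 0;			/* remove the whole extent */
}

int main(void)
{
	printf("%d\n", rm_leaf_case(150, 199, 100, 100)); /* tail: keeps 50 */
	printf("%d\n", rm_leaf_case(100, 199, 100, 100)); /* whole extent: 0 */
	return 0;
}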
Regression caused by d583fb87a3ff0; hit by the 266th testcase in xfstests (http://patchwork.ozlabs.org/patch/120872). Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 103 +++++++----------------------------------------------- 1 file changed, 12 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 385ebb435332..d0e3f333d3f0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2311,13 +2311,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; - ext4_lblk_t a, b, block; + ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; - struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2360,86 +2359,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; - } else if (a != ex_ee_block && - b != ex_ee_block + ex_ee_len - 1) { - /* - * If this is a truncate, then this condition should - * never happen because at least one of the end points - * needs to be on the edge of the extent. - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - block = 0; - num = 0; - err = -EIO; - goto out; - } - /* - * else this is a hole punch, so the extent needs to - * be split since neither edge of the hole is on the - * extent edge - */ - else{ - map.m_pblk = ext4_ext_pblock(ex); - map.m_lblk = ex_ee_block; - map.m_len = b - ex_ee_block; - - err = ext4_split_extent(handle, - inode, path, &map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (err < 0) - goto out; - - ex_ee_len = ext4_ext_get_actual_len(ex); - - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; - - /* Then remove tail of this extent */ - block = ex_ee_block; - num = a - block; - } + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ - block = ex_ee_block; - num = a - block; - } else if (b != ex_ee_block + ex_ee_len - 1) { - /* remove head of the extent */ - block = b; - num = ex_ee_block + ex_ee_len - b; - - /* - * If this is a truncate, this condition - * should never happen - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + num = a - ex_ee_block; } else { /* remove whole extent: excellent!
*/ - block = ex_ee_block; num = 0; - if (a != ex_ee_block) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } - - if (b != ex_ee_block + ex_ee_len - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } } - /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block @@ -2466,19 +2397,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - if (num == 0) { + if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - } else if (block != ex_ee_block) { - /* - * If this was a head removal, then we need to update - * the physical block since it is now at a different - * location - */ - ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); - } - ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); /* * Do not mark uninitialized if all the blocks in the @@ -2486,11 +2408,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (uninitialized && num) ext4_ext_mark_uninitialized(ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto out; - /* * If the extent was completely released, * we need to remove it from the leaf @@ -2513,6 +2430,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, } else *partial_cluster = 0; + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); ex--; -- cgit v1.2.3-58-ga151 From 16d0587090ab93206768f726f71d84ecf55e05c4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:28 +0300 Subject: NFSd: call svc rpcbind cleanup explicitly We have to call svc_rpcb_cleanup() explicitly from nfsd_last_thread(), since this function is registered as the service shutdown callback and thus nobody else will do it for us. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfsd/nfssvc.c | 2 ++ include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b1..52cd976b6099 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -256,6 +256,8 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_serv = NULL; nfsd_shutdown(); + svc_rpcb_cleanup(serv); + printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 223588a976a0..5e71a306216f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -401,6 +401,7 @@ struct svc_procedure { /* * Function prototypes.
*/ +void svc_rpcb_cleanup(struct svc_serv *serv); struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 407462ff4779..252552a685dc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -367,11 +367,12 @@ static int svc_rpcb_setup(struct svc_serv *serv) return 0; } -static void svc_rpcb_cleanup(struct svc_serv *serv) +void svc_rpcb_cleanup(struct svc_serv *serv) { svc_unregister(serv); rpcb_put_local(); } +EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { -- cgit v1.2.3-58-ga151 From a4e5d88b1b24827f4f6a3cba43228936a67d81ba Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 25 Oct 2011 08:15:12 -0400 Subject: ext4: update EOFBLOCKS flag on fallocate properly EOFBLOCKS_FL should be updated when fallocate is called w/o FALLOC_FL_KEEP_SIZE; currently that happens only if a new extent was allocated. TESTCASE: fallocate test_file -n -l4096 fallocate test_file -l4096 The last fallocate cmd updates the size but keeps EOFBLOCKS_FL set, and fsck will complain about that. Also remove the ping-pong in ext4_fallocate() in the case of new extents, where ext4_ext_map_blocks() clears the EOFBLOCKS bit and ext4_falloc_update_inode() later restores it. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents.c | 28 +++++++++++++++++----------- fs/ext4/inode.c | 6 ++++-- 3 files changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4546da4f26c4..3647ae0b21ab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -539,6 +539,8 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d0e3f333d3f0..8686eb756b38 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3432,14 +3432,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) { + if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } - out: if (ret <= 0) { err = ret; @@ -3480,6 +3474,12 @@ out: map_out: map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } out1: if (allocated > map->m_len) allocated = map->m_len; @@ -3955,7 +3955,10 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); @@ -4214,6 +4217,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int ret = 0; int ret2 = 0; int retries = 0; + int flags; struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits;
*file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_NO_NORMALIZE; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; @@ -4259,9 +4267,7 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff6aace0bb3c..87ec615f0fd6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -477,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read((&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } up_read((&EXT4_I(inode)->i_data_sem)); -- cgit v1.2.3-58-ga151 From b9e2780d576a010d4aba1e69f247170bf3718d6b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 25 Oct 2011 05:38:41 -0700 Subject: sysfs: Remove support for tagged directories with untagged members (again) In commit 8a9ea3237e7e ("Merge git://.../davem/net-next"), where my sysfs changes from the net tree merged with the sysfs rbtree changes from Mickulas Patocka, the conflict resolution failed to preserve the simplified property that was the point of my changes. That is, sysfs_find_dirent can now say something is a match if and only if s_name and s_ns match what we are looking for, and sysfs_readdir can simply return all of the directory entries where s_ns matches the directory that we should be returning. Now that we are back to exact matches we can tweak sysfs_find_dirent and the name rb_tree to order sysfs_dirents by s_ns and s_name and remove the second loop in sysfs_find_dirent. However, that change seems a bit much for a conflict resolution, so it can come later. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/sysfs/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0344ee70a47c..48ffbdf0d017 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -590,8 +590,8 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, #undef node } - if (found && ns) { - while (found->s_ns && found->s_ns != ns) { + if (found) { + while (found->s_ns != ns) { p = rb_next(&found->name_node); if (!p) return NULL; @@ -947,7 +947,7 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, #undef node } } - while (pos && pos->s_ns && pos->s_ns != ns) { + while (pos && pos->s_ns != ns) { struct rb_node *p = rb_next(&pos->inode_node); if (!p) pos = NULL; @@ -967,7 +967,7 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, pos = NULL; else pos = rb_entry(p, struct sysfs_dirent, inode_node); - } while (pos && pos->s_ns && pos->s_ns != ns); + } while (pos && pos->s_ns != ns); return pos; } -- cgit v1.2.3-58-ga151 From cf8039036a6e9c5f7144841925f212a957faf1aa Mon Sep 17 00:00:00 2001 From: "Darrick J.
Wong" Date: Tue, 25 Oct 2011 09:18:41 -0400 Subject: ext4: prevent stack overrun in ext4_file_open In ext4_file_open, the filesystem records the mountpoint of the first file that is opened after mounting the filesystem. It does this by allocating a 64-byte stack buffer, calling d_path() to grab the mount point through which this file was accessed, and then memcpy()ing 64 bytes into the superblock's s_last_mounted field, starting from the return value of d_path(), which is stored as "cp". However, if cp > buf (which it frequently is since path components are prepended starting at the end of buf) then we can end up copying stack data into the superblock. Writing stack variables into the superblock doesn't sound like a great idea, so use strlcpy instead. Andi Kleen suggested using strlcpy instead of strncpy. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e4095e988eba..9781099337ec 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp) path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); if (!IS_ERR(cp)) { - memcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); ext4_mark_super_dirty(sb); } } -- cgit v1.2.3-58-ga151 From 7c272194e66e91830b90f6202e61c69f8590f1eb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 3 Aug 2011 09:58:09 -0700 Subject: ceph: make readpages fully async When we get a ->readpages() aop, submit async reads for all page ranges in the provided page list. Lock the pages immediately, so that VFS/MM will block until the reads complete. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 185 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 115 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5a3953db8118..5bb39a50f904 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -228,102 +228,147 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op. 
*/ -static struct page **page_vector_from_list(struct list_head *page_list, - unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct page **pages; - struct page *page; - int next_index, contig_pages = 0; + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + int rc, bytes; + int i; - /* build page vector */ - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + rc = le32_to_cpu(replyhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); - BUG_ON(list_empty(page_list)); - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout("readpages page %d %p\n", contig_pages, page); - pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + + /* unlock all pages, zeroing any data we didn't read */ + for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { + struct page *page = req->r_pages[i]; + + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); } - *nr_pages = contig_pages; - return pages; + kfree(req->r_pages); } /* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. 
*/ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; - int rc = 0; - struct page **pages; - loff_t offset; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_osd_request *req; + u64 off; u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; - dout("readpages %p file %p nr_pages %d\n", - inode, file, nr_pages); - - pages = page_vector_from_list(page_list, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); + off = page->index << PAGE_CACHE_SHIFT; - /* guess read extent */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + } len = nr_pages << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - offset, &len, - ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages, 0); - if (rc == -ENOENT) - rc = 0; - if (rc < 0) - goto out; - - for (; !list_empty(page_list) && len > 0; - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { - struct page *page = - list_entry(page_list->prev, struct page, lru); + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + off, &len, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, + ci->i_truncate_seq, ci->i_truncate_size, + NULL, false, 1, 0); + if (!req) + return -ENOMEM; + /* build page vector */ + nr_pages = len >> PAGE_CACHE_SHIFT; + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); list_del(&page->lru); - - if (rc < (int)PAGE_CACHE_SIZE) { - /* zero (remainder of) page */ - int s = rc < 0 ? 0 : rc; - zero_user_segment(page, s, PAGE_CACHE_SIZE); - } - - if (add_to_page_cache_lru(page, mapping, page->index, + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { page_cache_release(page); - dout("readpages %p add_to_page_cache failed %p\n", + dout("start_read %p add_to_page_cache failed %p\n", inode, page); - continue; + nr_pages = i; + goto out_pages; } - dout("readpages %p adding %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); + pages[i] = page; } - rc = 0; + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_callback = finish_read; + req->r_inode = inode; + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; -out: +out_pages: + ceph_release_page_vector(pages, nr_pages); kfree(pages); +out: + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. 
+ */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + int rc = 0; + + dout("readpages %p file %p nr_pages %d\n", inode, file, nr_pages); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } +out: + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } -- cgit v1.2.3-58-ga151 From 83817e35cbd9b36db955a22418c9e30324353587 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Aug 2011 08:03:44 -0700 Subject: ceph: rename rsize -> rasize It controls readahead. Signed-off-by: Sage Weil --- fs/ceph/super.c | 14 +++++++++++--- fs/ceph/super.h | 8 +++++--- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 88bacaf385d9..387addbf942e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) enum { Opt_wsize, Opt_rsize, + Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -136,6 +137,7 @@ enum { static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, + {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_rsize: fsopt->rsize = intval; break; + case Opt_rasize: + fsopt->rasize = intval; + break; case Opt_caps_wanted_delay_min: fsopt->caps_wanted_delay_min = intval; break; @@ -293,6 +298,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; @@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rsize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -774,10 +782,10 @@ static int ceph_register_bdi(struct super_block *sb, { int err; - /* set ra_pages based on rsize mount option? */ - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + /* set ra_pages based on rasize mount option? 
*/ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a23eed526f05..5c72430b8f71 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -45,8 +46,9 @@ struct ceph_mount_options { int flags; int sb_flags; - int wsize; - int rsize; /* max readahead */ + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; int cap_release_safety; -- cgit v1.2.3-58-ga151 From 0d66a487c120012f33fbcd6af5cbf0a0cad71557 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Aug 2011 08:21:30 -0700 Subject: ceph: implement (optional) max read size The 'rsize' mount option limits the maximum size of an individual read(ahead) operation that is sent off to an OSD. This is distinct from 'rasize', which controls the size of the readahead window. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5bb39a50f904..5ffee90d9fb5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -268,7 +268,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. */ -static int start_read(struct inode *inode, struct list_head *page_list) +static int start_read(struct inode *inode, struct list_head *page_list, int max) { struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; @@ -292,6 +292,8 @@ static int start_read(struct inode *inode, struct list_head *page_list) break; nr_pages++; next_index++; + if (max && nr_pages == max) + break; } len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, @@ -358,11 +360,18 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned nr_pages) { struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int rc = 0; + int max = 0; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d\n", inode, file, nr_pages); + dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + max); while (!list_empty(page_list)) { - rc = start_read(inode, page_list); + rc = start_read(inode, page_list, max); if (rc < 0) goto out; BUG_ON(rc == 0); -- cgit v1.2.3-58-ga151 From 6a8ea4706adb4b4d8f77a8da5f9778b65fbf6f48 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Aug 2011 15:41:37 -0700 Subject: ceph: document ioctls ...after some prodding by Christoph. 
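For illustration, a minimal userspace sketch of querying a file layout with the ioctl documented below (a hypothetical example, not part of the patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* the ceph ioctl definitions below */

	int print_layout(const char *path)
	{
		struct ceph_ioctl_layout l = { 0 };
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0) {
			close(fd);
			return -1;
		}
		printf("su %llu sc %llu os %llu pool %llu\n",
		       (unsigned long long)l.stripe_unit,
		       (unsigned long long)l.stripe_count,
		       (unsigned long long)l.object_size,
		       (unsigned long long)l.data_pool);
		close(fd);
		return 0;
	}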
Signed-off-by: Sage Weil --- fs/ceph/ioctl.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 0c5167e43180..be4a60487333 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -6,7 +6,31 @@ #define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; @@ -21,6 +45,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * * Extract identity, address of the OSD and object storing a given * file offset. */ @@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ #define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif -- cgit v1.2.3-58-ga151 From 6ab00d465a1c8c02c2216f8220727282f3aa50b5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Aug 2011 09:41:59 -0700 Subject: libceph: create messenger with client This simplifies the init/shutdown paths, and makes client->msgr available during the rest of the setup process. 
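As a hedged sketch of the resulting call pattern (the concrete hunks follow), a libceph consumer now passes its feature bits at create time instead of poking client->supported_features afterwards; a user with no extra features, such as rbd, simply passes zeros:

	/* sketch: opt, private and the feature masks come from the caller */
	client = ceph_create_client(opt, private, supported_features,
				    required_features);
	if (IS_ERR(client))
		return PTR_ERR(client);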
Signed-off-by: Sage Weil --- drivers/block/rbd.c | 2 +- fs/ceph/super.c | 9 ++++++--- include/linux/ceph/libceph.h | 4 +++- net/ceph/ceph_common.c | 47 ++++++++++++++++++++++---------------------- 4 files changed, 34 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 15f65b5f3fc7..0b3c5663a187 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -260,7 +260,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - rbdc->client = ceph_create_client(opt, rbdc); + rbdc->client = ceph_create_client(opt, rbdc, 0, 0); if (IS_ERR(rbdc->client)) goto out_rbdc; opt = NULL; /* Now rbdc->client is responsible for opt */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 387addbf942e..f90dc0b011a7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -430,20 +430,23 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; + const unsigned supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + const unsigned required_features = 0; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc); + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 563755181c1e..95bd8502e715 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -215,7 +215,9 @@ extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private); + void *private, + unsigned supported_features, + unsigned required_features); extern u64 ceph_client_id(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 2883ea01e680..97f70e50ad3b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -432,9 +432,12 @@ EXPORT_SYMBOL(ceph_client_id); /* * create a fresh client instance */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, + unsigned supported_features, + unsigned required_features) { struct ceph_client *client; + struct ceph_entity_addr *myaddr = NULL; int err = -ENOMEM; client = kzalloc(sizeof(*client), GFP_KERNEL); @@ -449,15 +452,27 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) client->auth_err = 0; client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; - client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; - - client->msgr = NULL; + client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | + supported_features; + client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | + required_features; + + /* msgr */ + if (ceph_test_opt(client, MYIP)) + myaddr = &client->options->my_addr; + 
client->msgr = ceph_messenger_create(myaddr, + client->supported_features, + client->required_features); + if (IS_ERR(client->msgr)) { + err = PTR_ERR(client->msgr); + goto fail; + } + client->msgr->nocrc = ceph_test_opt(client, NOCRC); /* subsystems */ err = ceph_monc_init(&client->monc, client); if (err < 0) - goto fail; + goto fail_msgr; err = ceph_osdc_init(&client->osdc, client); if (err < 0) goto fail_monc; @@ -466,6 +481,8 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) fail_monc: ceph_monc_stop(&client->monc); +fail_msgr: + ceph_messenger_destroy(client->msgr); fail: kfree(client); return ERR_PTR(err); @@ -490,8 +507,7 @@ void ceph_destroy_client(struct ceph_client *client) ceph_debugfs_client_cleanup(client); - if (client->msgr) - ceph_messenger_destroy(client->msgr); + ceph_messenger_destroy(client->msgr); ceph_destroy_options(client->options); @@ -514,24 +530,9 @@ static int have_mon_and_osd_map(struct ceph_client *client) */ int __ceph_open_session(struct ceph_client *client, unsigned long started) { - struct ceph_entity_addr *myaddr = NULL; int err; unsigned long timeout = client->options->mount_timeout * HZ; - /* initialize the messenger */ - if (client->msgr == NULL) { - if (ceph_test_opt(client, MYIP)) - myaddr = &client->options->my_addr; - client->msgr = ceph_messenger_create(myaddr, - client->supported_features, - client->required_features); - if (IS_ERR(client->msgr)) { - client->msgr = NULL; - return PTR_ERR(client->msgr); - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); - } - /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); if (err < 0) -- cgit v1.2.3-58-ga151 From b61c27636fffbaf1980e675282777b9467254a40 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Aug 2011 15:03:46 -0700 Subject: libceph: don't complain on msgpool alloc failures The pool allocation failures are masked by the pool; there is no need to spam the console about them. (That's the whole point of having the pool in the first place.) Mark msg allocations whose failure is safely handled as such. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- fs/ceph/mds_client.c | 11 ++++++----- include/linux/ceph/messenger.h | 3 ++- net/ceph/messenger.c | 15 +++++++++++---- net/ceph/mon_client.c | 24 +++++++++++++++--------- net/ceph/msgpool.c | 4 ++-- net/ceph/osd_client.c | 8 ++++---- 7 files changed, 41 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8d74ad7ba556..b8731bf3ef1f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -945,7 +945,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? 
(int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 86c59e16ba74..1d72f15fe9f4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; @@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS); + GFP_NOFS, false); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1652,7 +1653,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -2518,7 +2519,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -2831,7 +2832,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index d7adf151d335..bbd4a4bb2c6b 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -238,7 +238,8 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); -extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9918e9eb276e..2de711a0c037 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2281,7 +2281,8 @@ EXPORT_SYMBOL(ceph_con_keepalive); * construct a new message with given type, size * the new msg has a ref count of 1. 
*/ -struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail) { struct ceph_msg *m; @@ -2333,7 +2334,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) m->front.iov_base = kmalloc(front_len, flags); } if (m->front.iov_base == NULL) { - pr_err("msg_new can't allocate %d bytes\n", + dout("ceph_msg_new can't allocate %d bytes\n", front_len); goto out2; } @@ -2348,7 +2349,13 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) out2: ceph_msg_put(m); out: - pr_err("msg_new can't create type %d front %d\n", type, front_len); + if (!can_fail) { + pr_err("msg_new can't create type %d front %d\n", type, + front_len); + } else { + dout("msg_new can't create type %d front %d\n", type, + front_len); + } return NULL; } EXPORT_SYMBOL(ceph_msg_new); @@ -2398,7 +2405,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, } if (!msg) { *skip = 0; - msg = ceph_msg_new(type, front_len, GFP_NOFS); + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 556721665335..af2ef3082499 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -517,10 +517,12 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) init_completion(&req->completion); err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, + true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, + true); if (!req->reply) goto out; @@ -615,10 +617,12 @@ int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, init_completion(&req->completion); err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); + req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, + true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); + req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, + true); if (!req->reply) goto out; @@ -765,19 +769,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) err = -ENOMEM; monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, sizeof(struct ceph_mon_subscribe_ack), - GFP_NOFS); + GFP_NOFS, true); if (!monc->m_subscribe_ack) goto out_con; - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, + true); if (!monc->m_subscribe) goto out_subscribe_ack; - monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, + true); if (!monc->m_auth_reply) goto out_subscribe; - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); monc->pending_auth = 0; if (!monc->m_auth) goto out_auth_reply; @@ -970,7 +976,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, GFP_NOFS); + m = ceph_msg_new(type, front_len, GFP_NOFS, false); break; } diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 
1f4cb30a42c5..11d5f4196a73 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(0, pool->front_len, gfp_mask); + msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -61,7 +61,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, WARN_ON(1); /* try to alloc a fresh message */ - return ceph_msg_new(0, front_len, GFP_NOFS); + return ceph_msg_new(0, front_len, GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 88ad8a2501b5..01ceb59a8b4a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -227,7 +227,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, gfp_flags); + OSD_OPREPLY_FRONT_LEN, gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; @@ -250,7 +250,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); if (!msg) { ceph_osdc_put_request(req); return NULL; @@ -2032,7 +2032,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); if (!m) goto out; ceph_msg_put(req->r_reply); @@ -2080,7 +2080,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: - return ceph_msg_new(type, front, GFP_NOFS); + return ceph_msg_new(type, front, GFP_NOFS, false); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: -- cgit v1.2.3-58-ga151 From 80db8bea6a0f4fd047eafd8329a44d5a110f462b Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Mon, 22 Aug 2011 13:49:23 -0600 Subject: ceph: replace leading spaces with tabs Trivial formatting fix. 
Signed-off-by: Noah Watkins Signed-off-by: Sage Weil --- fs/ceph/super.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f90dc0b011a7..788f5ad8e66d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -294,29 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; - fsopt->rasize = CEPH_RASIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); -- cgit v1.2.3-58-ga151 From 83eaea22bdfc9e1cec88f81be5b64f30f6c37e8b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 24 Aug 2011 14:07:01 -0700 Subject: Revert "ceph: don't truncate dirty pages in invalidate work thread" This reverts commit c9af9fb68e01eb2c2165e1bc45cfeeed510c64e6. We need to block and truncate all pages in order to reliably invalidate them. Otherwise, we could: - have some uptodate pages in the cache - queue an invalidate - write(2) locks some pages - invalidate_work skips them - write(2) only overwrites part of the page - page now dirty and uptodate -> partial leakage of invalidated data It's not entirely clear why we started skipping locked pages in the first place. I just ran this through fsx and didn't see any problems. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 095799ba9dd1..5dde7d51dc11 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "super.h" #include "mds_client.h" @@ -1363,49 +1362,6 @@ void ceph_queue_invalidate(struct inode *inode) } } -/* - * invalidate any pages that are not dirty or under writeback. this - * includes pages that are clean and mapped. 
- */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ - struct pagevec pvec; - pgoff_t next = 0; - int i; - - pagevec_init(&pvec, 0); - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t index; - int skip_page = - (PageDirty(page) || PageWriteback(page)); - - if (!skip_page) - skip_page = !trylock_page(page); - - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ - index = page->index; - if (index > next) - next = index; - next++; - - if (skip_page) - continue; - - generic_error_remove_page(mapping, page); - unlock_page(page); - } - pagevec_release(&pvec); - cond_resched(); - } -} - /* * Invalidate inode pages in a worker thread. (This can't be done * in the message handler context.) @@ -1429,7 +1385,7 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&inode->i_lock); - ceph_invalidate_nondirty_pages(inode->i_mapping); + truncate_inode_pages(&inode->i_data, 0); spin_lock(&inode->i_lock); if (orig_gen == ci->i_rdcache_gen && -- cgit v1.2.3-58-ga151 From a35eca958aa1c7d0b5f993db1a3ded45ae16d59b Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Thu, 25 Aug 2011 12:43:06 -0700 Subject: ceph: let the set_layout ioctl set single traits Previously we were validating the passed-in stripe unit, object size, and stripe count against each other (and not testing most other stuff). Instead, make sure that the composed previous layout and new values are valid, and only send the new values to the MDS. This lets users change the pool without setting the whole layout, for instance. 
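For example (a hypothetical userspace sketch, not part of the patch), changing only the pool now amounts to leaving every other field zero, given an fd on a newly created file:

	struct ceph_ioctl_layout l = { 0 };

	l.data_pool = 3;	/* hypothetical pool id; zeroed fields mean "keep current" */
	if (ioctl(fd, CEPH_IOC_SET_LAYOUT, &l) < 0)
		perror("CEPH_IOC_SET_LAYOUT");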
Signed-off-by: Greg Farnum --- fs/ceph/ioctl.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 3b256b50f7d8..5a14c29cbba6 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout nl; int err, i; - /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) + /* validate changed params against current layout */ + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + nl.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + } else + return err; + + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + if (l.object_size) + nl.object_size = l.object_size; + if (l.data_pool) + nl.data_pool = l.data_pool; + if (l.preferred_osd) + nl.preferred_osd = l.preferred_osd; + + if ((nl.object_size & ~PAGE_MASK) || + (nl.stripe_unit & ~PAGE_MASK) || + ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) return -EINVAL; /* make sure it's a valid data pool */ -- cgit v1.2.3-58-ga151 From 3310f7541f0c991b51324a7712db51fb8f912601 Mon Sep 17 00:00:00 2001 From: Amon Ott Date: Thu, 20 Oct 2011 13:04:07 -0700 Subject: ceph: fix 32-bit ino numbers Fix 32-bit ino generation to not always be 1. Signed-off-by: Amon Ott --- fs/ceph/super.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5c72430b8f71..b01442aaf278 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -346,9 +346,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) * x86_64+ino32 64 32 * x86_64 64 64 */ -static inline u32 ceph_ino_to_ino32(ino_t ino) +static inline u32 ceph_ino_to_ino32(__u64 vino) { - ino ^= ino >> (sizeof(ino) * 8 - 32); + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; if (!ino) ino = 1; return ino; @@ -359,11 +360,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino) */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino = ceph_ino_to_ino32(ino); + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; #endif - return ino; } /* -- cgit v1.2.3-58-ga151 From 339573406737461cfb17bebabf7ba536a302d841 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Oct 2011 09:05:47 -0700 Subject: libceph: fix double-free of page vector ceph_release_page_vector() kfrees the vector; we shouldn't do it here too. 
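The bug in miniature (sketch): a release helper that already kfrees the vector must not be followed by a second free of the same pointer:

	ceph_release_page_vector(pages, nr_pages);	/* puts the pages and kfrees pages[] */
	kfree(pages);					/* double free -- dropped below */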
Reported-by: Jeff Wu Signed-off-by: Sage Weil --- fs/ceph/addr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5ffee90d9fb5..4144caf2f9d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -345,7 +345,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) out_pages: ceph_release_page_vector(pages, nr_pages); - kfree(pages); out: ceph_osdc_put_request(req); return ret; -- cgit v1.2.3-58-ga151 From 909a4cf1ffe4b875c87abf38239a9bfd25167e0c Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 26 Oct 2011 03:22:31 -0400 Subject: ext4: avoid setting directory i_nlink to zero If a directory has more than EXT4_LINK_MAX subdirectories, its nlink count is set to 1. Subsequently, if any subdirectories are deleted, ext4_dec_count() decrements the i_nlink count, which may go to 0 temporarily before being incremented back to 1. While this is done under i_mutex, which prevents races for directory and inode operations that check i_nlink, the temporary i_nlink == 0 case is exposed to userspace via stat() and similar calls that do not hold i_mutex. Instead, change the code to not decrement the i_nlink count for any directories that do not already have i_nlink larger than 2. Reported-by: Cliff White Reviewed-by: Johann Lombardi Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a067835bbac1..4a550aa07614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1706,9 +1706,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) */ static void ext4_dec_count(handle_t *handle, struct inode *inode) { - drop_nlink(inode); - if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) - inc_nlink(inode); + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } -- cgit v1.2.3-58-ga151 From 665436175c3ca9d35f135e1ba6bdd63745cff08a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 26 Oct 2011 03:32:07 -0400 Subject: ext4: use ext4_reserve_inode_write in ext4_xattr_set_handle ext4_mark_iloc_dirty() says: * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. ext4_xattr_set_handle, however, just open-codes it. May as well use the helper function for consistency. No bug here, just tidiness. (Note: on the cleanup path, ext4_reserve_inode_write sets the bh to NULL if it returns an error, and brelse() of a NULL bh is handled gracefully).
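A hedged before/after sketch of the tidy-up:

	/* before: open-coded */
	error = ext4_get_inode_loc(inode, &is.iloc);
	if (!error)
		error = ext4_journal_get_write_access(handle, is.iloc.bh);

	/* after: the helper performs both steps */
	error = ext4_reserve_inode_write(handle, inode, &is.iloc);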
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc97250..0ae3668520f8 100.644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -985,11 +985,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - goto cleanup; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; -- cgit v1.2.3-58-ga151 From f85b287a01237857a50c93868231f7e831581a27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Oct 2011 03:42:36 -0400 Subject: ext4: error handling fix in ext4_ext_convert_to_initialized() When allocated is unsigned it breaks the error handling at the end of the function when we call: allocated = ext4_split_extent(...); if (allocated < 0) err = allocated; I've made it a signed int instead of unsigned. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8686eb756b38..02a4d80573a1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2928,7 +2928,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; - unsigned int allocated, ee_len, depth; + unsigned int ee_len, depth; + int allocated; int err = 0; int split_flag = 0; -- cgit v1.2.3-58-ga151 From 6f8ff537266ee5396c920fb0c842a21df3055ff3 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Wed, 26 Oct 2011 04:38:59 -0400 Subject: ext4: handle NULL p_ext in ext4_ext_next_allocated_block() In ext4_ext_next_allocated_block(), the path[depth] might have a p_ext that is NULL -- see ext4_ext_binsearch(). In such a case, dereferencing it will crash the machine. This patch checks for p_ext == NULL in ext4_ext_next_allocated_block() before dereferencing it. Tested using a hand-crafted inode with eh_entries == 0 in an extent block; verified that running FIEMAP on it crashes without this patch and works fine with it. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 02a4d80573a1..797b63b59740 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1392,7 +1392,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) while (depth >= 0) { if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext != + if (path[depth].p_ext && + path[depth].p_ext != EXT_LAST_EXTENT(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_ext[1].ee_block); } else { -- cgit v1.2.3-58-ga151 From fcbb5515825f1bb20b7a6f75ec48bee61416f879 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Wed, 26 Oct 2011 05:00:19 -0400 Subject: ext4: let ext4_page_mkwrite stop started handle in failure The started journal handle should be stopped in the failure case.
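In miniature (a sketch of the fixed error path; 'needed' stands in for the actual credit count): once the handle has been started, every bail-out must pass through ext4_journal_stop():

	handle = ext4_journal_start(inode, needed);
	/* ... */
	if (walk_page_buffers(handle, page_buffers(page), 0,
			      PAGE_CACHE_SIZE, NULL,
			      do_journal_get_write_access)) {
		unlock_page(page);
		ext4_journal_stop(handle);	/* previously leaked */
		ret = VM_FAULT_SIGBUS;
		goto out;
	}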
Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" Acked-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 87ec615f0fd6..e4b26faac5ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4791,6 +4791,7 @@ retry_alloc: PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); goto out; } ext4_set_inode_state(inode, EXT4_STATE_JDATA); -- cgit v1.2.3-58-ga151 From ebbe027797f67d34708ccfabdb129886d549f9ce Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 05:14:27 -0400 Subject: ext4: use stream-alloc when mb_group_prealloc set to zero The kernel will crash on ext4_mb_mark_diskspace_used: BUG_ON(ac->ac_b_ex.fe_len <= 0); after we set /sys/fs/ext4/sda/mb_group_prealloc to zero and create new files in an ext4 filesystem. The reason is that ac_b_ex.fe_len is also set to zero (mb_group_prealloc) in ext4_mb_normalize_group_request, because ac_flags contains EXT4_MB_HINT_GROUP_ALLOC. When someone sets mb_group_prealloc to zero, it means DO NOT USE GROUP PREALLOCATION, so we should set the alloc-strategy to STREAM in this case. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd70b3041185..89762652aae2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4006,6 +4006,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) return; } + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + /* don't use group allocation for large files */ size = max(size, isize); if (size > sbi->s_mb_stream_request) { -- cgit v1.2.3-58-ga151 From 66a83cde47deb4e8874539326e12e88ed82158d3 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 05:29:21 -0400 Subject: ext4: remove unused variable in ext4_mb_generate_from_pa() The variable 'count' in function ext4_mb_generate_from_pa() looks useless, so remove it. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 89762652aae2..c66531d800b1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3345,7 +3345,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3368,7 +3367,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, BUG_ON(groupnr != group); ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } -- cgit v1.2.3-58-ga151 From b051d8dc4e1f011e1b0543a875f5861be5d90222 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 05:30:30 -0400 Subject: ext4: remove unused variable in mb_find_extent() The variable 'ord' in function mb_find_extent() is redundant, so remove it.
Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c66531d800b1..2bb1ddc5c30b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1391,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, { int next = block; int max; - int ord; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1433,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } -- cgit v1.2.3-58-ga151 From 0a10da73e1fa6fb9b45f1166011ff3b04c27c010 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Wed, 26 Oct 2011 08:48:54 -0400 Subject: ext4: fix a wrong comment in __mb_check_buddy() The comment says the bit should be 0, but the code that follows asserts the bit to be 1. This is confusing, so fix it. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2bb1ddc5c30b..e2d8be8f28bf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -581,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -- cgit v1.2.3-58-ga151 From b3ff05690845911cc40387176f0bc5a7af9ef3ff Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 26 Oct 2011 11:08:39 -0400 Subject: ext4: don't check io->flag when setting EXT4_STATE_DIO_UNWRITTEN inode state When we want to convert the uninitialized extent in direct write, we can either do it in ext4_end_io_nolock() (AIO case) or in ext4_ext_direct_IO() (non-AIO case), and EXT4_I(inode)->cur_aio_dio is a guard for ext4_ext_map_blocks to find the right case. In e9e3bcecf, we mistakenly changed it to: - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); So now if we map 2 blocks and the first one sets EXT4_IO_END_UNWRITTEN, the 2nd mapping will set the inode state instead, because of the check for the flag. This is wrong.
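In other words, the io->flag check has to nest inside the AIO branch rather than gate it, so a flag set by an earlier mapping can never push a later mapping onto the inode-state path. The corrected shape, as the patch below applies it at both call sites:

	if (io) {
		/* AIO: mark the io_end, counting it only once */
		if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
			io->flag = EXT4_IO_END_UNWRITTEN;
			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
		}
	} else
		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);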
Cc: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 797b63b59740..c2ac06cb2d46 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3390,9 +3390,11 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + if (io) { + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) @@ -3946,9 +3948,11 @@ got_allocated_blocks: * that we need to perform conversion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + if (io) { + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); -- cgit v1.2.3-58-ga151 From 60325f0c6ee7c6b68f95aaa643260fb33d4bdd88 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 26 Oct 2011 18:26:49 -0700 Subject: fs/Makefile: Stupid typo breakage of exofs inclusion In my last patch I made a stupid mistake and broke the exofs compilation completely. Fix it ASAP. Instead of obj-y I wrote obj-$(y). Really Really sorry. Me totally blushing :-{| Signed-off-by: Boaz Harrosh Signed-off-by: Linus Torvalds --- fs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index 5c30a13341eb..d2c3353d5477 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(y) += exofs/ # Multiple mods, used by nfs/objlayout +obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ -- cgit v1.2.3-58-ga151 From 44705754610dbc63503bc7679ff9d9f84978a76f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Oct 2011 04:05:13 -0400 Subject: jbd2: fix build when CONFIG_BUG is not enabled Fix build error when CONFIG_BUG is not enabled: fs/jbd2/transaction.c:1175:3: error: implicit declaration of function '__WARN' by changing __WARN() to WARN_ON(), as suggested by Arnaud Lacombe .
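For background, __WARN() only exists when CONFIG_BUG=y, while WARN_ON() is defined in both configurations and still evaluates its condition when warnings are compiled out. A condensed sketch of the include/asm-generic/bug.h arrangement of that era (simplified; details vary by architecture and config):

	#ifdef CONFIG_BUG
	#define __WARN()	warn_slowpath_null(__FILE__, __LINE__)
	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		if (unlikely(__ret_warn_on))				\
			__WARN();					\
		unlikely(__ret_warn_on);				\
	})
	#else
	/* no __WARN() here - hence the implicit declaration error */
	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		unlikely(__ret_warn_on);				\
	})
	#endif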
Signed-off-by: Randy Dunlap Signed-off-by: "Theodore Ts'o" Cc: Arnd Bergmann Cc: Arnaud Lacombe --- fs/jbd2/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b01fd6104089..1e5c5ead5b0d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -27,6 +27,7 @@ #include #include #include +#include #include static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -1171,8 +1172,7 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - if (ret) - __WARN(); /* All errors are bugs, so dump the stack */ + WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } -- cgit v1.2.3-58-ga151 From 6f91bc5fda82d2c49b4f7fb29424cf6a3c7574bc Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 27 Oct 2011 11:43:23 -0400 Subject: ext4: optimize ext4_ext_convert_to_initialized() This patch introduces a fast path in ext4_ext_convert_to_initialized() for the case when the conversion can be performed by transferring the newly initialized blocks from the uninitialized extent into an adjacent initialized extent. Doing so removes the expensive invocations of memmove() which occur during extent insertion and the subsequent merge. In practice this should be the common case for clients performing append writes into files pre-allocated via fallocate(FALLOC_FL_KEEP_SIZE). In such a workload performed via direct IO and when using a suboptimal implementation of memmove() (x86_64 prior to the 2.6.39 rewrite), this patch reduces kernel CPU consumption by 32%. Two new trace points are added to ext4_ext_convert_to_initialized() to offer visibility into its operations. No exit trace point has been added due to the multiplicity of return points. This can be revisited once the upstream cleanup is backported. Signed-off-by: Eric Gouriou Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 93 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/super.c | 1 + include/trace/events/ext4.h | 82 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c2ac06cb2d46..8b6a17b60970 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2919,12 +2919,23 @@ out: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. 
*/ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; @@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); + eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len); + + /* + * Attempt to transfer newly initialized blocks from the currently + * uninitialized extent to its left neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. This is the common case in steady state for + * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append + * writes. + * + * Limitations of the current logic: + * - L1: we only deal with writes at the start of the extent. + * The approach could be extended to writes at the end + * of the extent but this scenario was deemed less common. + * - L2: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L3: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && /*L1*/ + (map->m_len < ee_len) && /*L2*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ + struct ext4_extent *prev_ex; + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len, write_len; + + prev_ex = ex - 1; + prev_lblk = le32_to_cpu(prev_ex->ee_block); + prev_len = ext4_ext_get_actual_len(prev_ex); + prev_pblk = ext4_ext_pblock(prev_ex); + ee_pblk = ext4_ext_pblock(ex); + write_len = map->m_len; + + /* + * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * upon those conditions: + * - C1: prev_ex is initialized, + * - C2: prev_ex is logically abutting ex, + * - C3: prev_ex is physically abutting ex, + * - C4: prev_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
+ */ + if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, prev_ex); + + /* Shift the start of ex by 'write_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + write_len); + ext4_ext_store_pblock(ex, ee_pblk + write_len); + ex->ee_len = cpu_to_le16(ee_len - write_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend prev_ex by 'write_len' blocks */ + prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = prev_ex; + + /* Result: number of initialized blocks past m_lblk */ + allocated = write_len; + goto out; + } + } + WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcc460537bc7..9953d80145ad 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include #include "ext4.h" +#include "ext4_extents.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index c9a341e385a3..748ff7cbe555 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -9,6 +9,7 @@ struct ext4_allocation_context; struct ext4_allocation_request; +struct ext4_extent; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; @@ -1394,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_ARGS(inode) ); +/* 'ux' is the uninitialized extent. */ +TRACE_EVENT(ext4_ext_convert_to_initialized_enter, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + struct ext4_extent *ux), + + TP_ARGS(inode, map, ux), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, m_lblk ) + __field( unsigned, m_len ) + __field( ext4_lblk_t, u_lblk ) + __field( unsigned, u_len ) + __field( ext4_fsblk_t, u_pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->m_lblk = map->m_lblk; + __entry->m_len = map->m_len; + __entry->u_lblk = le32_to_cpu(ux->ee_block); + __entry->u_len = ext4_ext_get_actual_len(ux); + __entry->u_pblk = ext4_ext_pblock(ux); + ), + + TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " + "u_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->m_lblk, __entry->m_len, + __entry->u_lblk, __entry->u_len, __entry->u_pblk) +); + +/* + * 'ux' is the uninitialized extent. + * 'ix' is the initialized extent to which blocks are transferred. 
+ */ +TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + struct ext4_extent *ux, struct ext4_extent *ix), + + TP_ARGS(inode, map, ux, ix), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, m_lblk ) + __field( unsigned, m_len ) + __field( ext4_lblk_t, u_lblk ) + __field( unsigned, u_len ) + __field( ext4_fsblk_t, u_pblk ) + __field( ext4_lblk_t, i_lblk ) + __field( unsigned, i_len ) + __field( ext4_fsblk_t, i_pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->m_lblk = map->m_lblk; + __entry->m_len = map->m_len; + __entry->u_lblk = le32_to_cpu(ux->ee_block); + __entry->u_len = ext4_ext_get_actual_len(ux); + __entry->u_pblk = ext4_ext_pblock(ux); + __entry->i_lblk = le32_to_cpu(ix->ee_block); + __entry->i_len = ext4_ext_get_actual_len(ix); + __entry->i_pblk = ext4_ext_pblock(ix); + ), + + TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " + "u_lblk %u u_len %u u_pblk %llu " + "i_lblk %u i_len %u i_pblk %llu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->m_lblk, __entry->m_len, + __entry->u_lblk, __entry->u_len, __entry->u_pblk, + __entry->i_lblk, __entry->i_len, __entry->i_pblk) +); + DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), -- cgit v1.2.3-58-ga151 From 80e675f906db54eb1ce3a9555cee5f45b5b72ab2 Mon Sep 17 00:00:00 2001 From: Eric Gouriou Date: Thu, 27 Oct 2011 11:52:18 -0400 Subject: ext4: optimize memmove lengths in extent/index insertions ext4_ext_insert_extent() (respectively ext4_ext_insert_index()) was using EXT_MAX_EXTENT() (resp. EXT_MAX_INDEX()) to determine how many entries needed to be moved beyond the insertion point. In practice this means that (320 - I) * 24 bytes were memmove()'d when I is the insertion point, rather than (#entries - I) * 24 bytes. This patch uses EXT_LAST_EXTENT() (resp. EXT_LAST_INDEX()) instead to only move existing entries. The code flow is also simplified slightly to highlight similarities and reduce code duplication in the insertion logic. This patch reduces system CPU consumption by over 25% on a 4kB synchronous append DIO write workload when used with the pre-2.6.39 x86_64 memmove() implementation. With the much faster 2.6.39 memmove() implementation we still see a decrease in system CPU usage between 2% and 7%. Note that the ext_debug() output changes with this patch, splitting some log information between entries. Users of the ext_debug() output should note that the "move %d" units changed from reporting the number of bytes moved to reporting the number of entries moved. Signed-off-by: Eric Gouriou Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 85 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8b6a17b60970..29969622af8b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -754,31 +754,25 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, return -EIO; } - len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { - len = (len - 1) * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d after: %llu. 
" - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - (curp->p_idx + 1), (curp->p_idx + 2)); - memmove(curp->p_idx + 2, curp->p_idx + 1, len); - } + ext_debug("insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - len = len * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d before: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - curp->p_idx, (curp->p_idx + 1)); - memmove(curp->p_idx + 1, curp->p_idx, len); + ext_debug("insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EIO; @@ -1779,41 +1773,46 @@ has_space: ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); - path[depth].p_ext = EXT_FIRST_EXTENT(eh); - } else if (le32_to_cpu(newext->ee_block) + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { -/* BUG_ON(newext->ee_block == nearex->ee_block); */ - if (nearex != EXT_LAST_EXTENT(eh)) { - len = EXT_MAX_EXTENT(eh) - nearex; - len = (len - 1) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", + /* Insert after */ + ext_debug("insert %d:%llu:[%d]%d %s before: " + "nearest 0x%p\n" + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %d:%llu:[%d]%d %s after: " + "nearest 0x%p\n" le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 2, nearex + 1, len); + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %d:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); } - path[depth].p_ext = nearex + 1; - } else { - BUG_ON(newext->ee_block == nearex->ee_block); - len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); - len = len < 0 ? 
0 : len; - ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - nearex, len, nearex, nearex + 1); - memmove(nearex + 1, nearex, len); - path[depth].p_ext = nearex; } le16_add_cpu(&eh->eh_entries, 1); - nearex = path[depth].p_ext; + path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; -- cgit v1.2.3-58-ga151 From 96814ecb404d587fade79af70bb741bc753e9ffb Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 24 Oct 2011 20:46:50 -0500 Subject: Add definition for share encryption Samba supports a setfs info level to negotiate encrypted shares. This patch adds the defines so we recognize this info level. Later patches will add the enablement for it. Acked-by: Jeremy Allison Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 3c6ef34fe2bc..3fb03e2c8e86 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1911,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */ /* SETFSInfo Levels */ #define SMB_SET_CIFS_UNIX_INFO 0x200 +/* level 0x203 is defined above in list of QFS info levels */ +/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ + +/* Level 0x200 request structure follows */ typedef struct smb_com_transaction2_setfsi_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -1938,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req { __le64 ClientUnixCap; /* Data end */ } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; +/* level 0x203 request structure follows */ +typedef struct smb_com_transaction2_setfs_enc_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 Reserved4; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + /* NTLMSSP Blob, Data start. */ +} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; + +/* response for setfsinfo levels 0x200 and 0x203 */ typedef struct smb_com_transaction2_setfsi_rsp { struct smb_hdr hdr; /* wct = 10 */ struct trans2_resp t2; __u16 ByteCount; } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; - typedef struct smb_com_transaction2_get_dfs_refer_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -2096,13 +2126,13 @@ typedef struct { #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX -/* Can not set pathnames cap yet until we send new posix create SMB since - otherwise server can treat such handles opened with older ntcreatex - (by a new client which knows how to send posix path ops) - as non-posix handles (can affect write behavior with byte range locks. 
- We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send + LockingX instead of posix locking call on unix sess (and we do not expect + LockingX to use different (ie Windows) semantics than posix locking on + the same session (if WINE needs to do this later, we can add this cap + back in later */ /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ -#define CIFS_UNIX_CAP_MASK 0x000000db +#define CIFS_UNIX_CAP_MASK 0x000003db #else #define CIFS_UNIX_CAP_MASK 0x00000013 #endif /* CONFIG_CIFS_POSIX */ -- cgit v1.2.3-58-ga151 From 814e1d25a59662f9552e6dc1305d1df3616fc87e Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 1 Sep 2011 08:22:57 +0800 Subject: cleanup: vfs: small comment fix for block_invalidatepage The patch is against 3.1-rc3. Signed-off-by: Wang Sheng-Hui Signed-off-by: Christoph Hellwig --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..936d6035f6e2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1470,13 +1470,13 @@ static void discard_buffer(struct buffer_head * bh) } /** - * block_invalidatepage - invalidate part of all of a buffer-backed page + * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected * @offset: the index of the truncation point * * block_invalidatepage() is called when all or part of the page has become - * invalidatedby a truncate operation. + * invalidated by a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O -- cgit v1.2.3-58-ga151 From a877ee03ac010ded434b77f7831f43cbb1fcc60f Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 7 Oct 2011 13:41:15 -0400 Subject: vfs: add "device" tag to /proc/self/mountstats nfsiostat was failing to find mounted filesystems on kernels after 2.6.38 because of changes to show_vfsstat() by commit c7f404b40a3665d9f4e9a927cc5c1ee0479ed8f9. This patch adds back the "device" tag before the nfs server entry so scripts can parse the mountstats file correctly. Signed-off-by: Bryan Schumaker CC: stable@kernel.org [>=2.6.39] Signed-off-by: Christoph Hellwig --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b4febb29d3bb..e5e1c7d1839b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1109,6 +1109,7 @@ static int show_vfsstat(struct seq_file *m, void *v) /* device */ if (mnt->mnt_sb->s_op->show_devname) { + seq_puts(m, "device "); err = mnt->mnt_sb->s_op->show_devname(m, mnt); } else { if (mnt->mnt_devname) { -- cgit v1.2.3-58-ga151 From 1448c721e4fa2faf742029a0403b4b787fccb7fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Oct 2011 13:40:02 -0700 Subject: compat: sync compat_stats with statfs. This was found by inspection while tracking a similar bug in compat_statfs64, that has been fixed in mainline since December. - This fixes a bug where not all of the f_spare fields were cleared on mips and s390. - Add the f_flags field to struct compat_statfs - Copy f_flags to userspace in case someone cares. - Use __clear_user to copy the f_spare field to userspace to ensure that all of the elements of f_spare are cleared. On some architectures f_spare has 5 ints and on some architectures f_spare only has 4 ints, which makes the previous technique of clearing each int individually broken.
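Sizing the zeroing from the array type itself is what makes the new code robust across those layouts; a minimal sketch of the idea:

	/* clears 4 or 5 ints - however many the arch's struct compat_statfs declares */
	if (__clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
		return -EFAULT;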
I don't expect anyone actually uses the old statfs system call anymore but if they do let them benefit from having the compat and the native version working the same. Signed-off-by: Eric W. Biederman Signed-off-by: Christoph Hellwig --- arch/mips/include/asm/compat.h | 3 ++- arch/parisc/include/asm/compat.h | 3 ++- arch/powerpc/include/asm/compat.h | 3 ++- arch/s390/include/asm/compat.h | 3 ++- arch/sparc/include/asm/compat.h | 3 ++- arch/x86/include/asm/compat.h | 3 ++- fs/compat.c | 7 ++----- 7 files changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index dbc51065df5b..b77df0366ee6 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -111,7 +111,8 @@ struct compat_statfs { int f_bavail; compat_fsid_t f_fsid; int f_namelen; - int f_spare[6]; + int f_flags; + int f_spare[5]; }; #define COMPAT_RLIM_INFINITY 0x7fffffffUL diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index efa0b60c63fe..760f331d4fa3 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -105,7 +105,8 @@ struct compat_statfs { __kernel_fsid_t f_fsid; s32 f_namelen; s32 f_frsize; - s32 f_spare[5]; + s32 f_flags; + s32 f_spare[4]; }; struct compat_sigcontext { diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 91010e8f8479..88e602f6430d 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -100,7 +100,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index da359ca6fe55..cdb9b78f6c08 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -131,7 +131,8 @@ struct compat_statfs { compat_fsid_t f_fsid; s32 f_namelen; s32 f_frsize; - s32 f_spare[6]; + s32 f_flags; + s32 f_spare[5]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 6f57325bb883..b8be20d42a0a 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -134,7 +134,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_INFINITY 0x7fffffff diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 1d9cd27c2920..30d737ef2a42 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -108,7 +108,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. 
*/ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/fs/compat.c b/fs/compat.c index 58b1da459893..0f1ab468fa2d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -247,11 +247,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(0, &ubuf->f_spare[0]) || - __put_user(0, &ubuf->f_spare[1]) || - __put_user(0, &ubuf->f_spare[2]) || - __put_user(0, &ubuf->f_spare[3]) || - __put_user(0, &ubuf->f_spare[4])) + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } -- cgit v1.2.3-58-ga151 From 8fd90c8d1dacb5ff0f372217c97f57a9e61559cd Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 23 Oct 2011 23:13:30 +0530 Subject: vfs: indicate that the permission functions take all the MAY_* flags Acked-by: J. Bruce Fields Acked-by: David Howells Signed-off-by: Andreas Gruenbacher Signed-off-by: Aneesh Kumar K.V Signed-off-by: Christoph Hellwig --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0b3138de2a3b..2a4574f48001 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -257,7 +257,7 @@ other_perms: /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -331,7 +331,7 @@ static inline int do_inode_permission(struct inode *inode, int mask) /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on an inode. * We use "fsuid" for this, letting us set arbitrary permissions -- cgit v1.2.3-58-ga151 From d124b60a838141bb9cac1b6567e9ca4539d1fff0 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 23 Oct 2011 23:13:32 +0530 Subject: vfs: pass all mask flags check_acl and posix_acl_permission Acked-by: J. 
Bruce Fields Acked-by: David Howells Signed-off-by: Andreas Gruenbacher Signed-off-by: Aneesh Kumar K.V Signed-off-by: Christoph Hellwig --- fs/namei.c | 2 -- fs/posix_acl.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 2a4574f48001..276cd30ab8f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,8 +227,6 @@ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; - if (current_user_ns() != inode_userns(inode)) goto other_perms; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 10027b42b7e2..cea4623f1ed6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; + want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: -- cgit v1.2.3-58-ga151 From 948409c74d217f6cf054b8c927765a1c3fe16b53 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 23 Oct 2011 23:13:33 +0530 Subject: vfs: add a comment to inode_permission() Acked-by: J. Bruce Fields Acked-by: David Howells Signed-off-by: Andreas Gruenbacher Signed-off-by: Aneesh Kumar K.V Signed-off-by: Christoph Hellwig --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 276cd30ab8f8..9061157e39d6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -221,7 +221,7 @@ static int check_acl(struct inode *inode, int mask) } /* - * This does basic POSIX ACL permission checking + * This does the basic permission checking */ static int acl_permission_check(struct inode *inode, int mask) { @@ -271,7 +271,7 @@ int generic_permission(struct inode *inode, int mask) int ret; /* - * Do the basic POSIX ACL permission checks. + * Do the basic permission checks. */ ret = acl_permission_check(inode, mask); if (ret != -EACCES) @@ -335,6 +335,8 @@ static inline int do_inode_permission(struct inode *inode, int mask) * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct inode *inode, int mask) { -- cgit v1.2.3-58-ga151 From 62a3ddef6181d7d932c565d97552d2f7b9ab4d28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2011 10:03:41 +0200 Subject: vfs: fix spinning prevention in prune_icache_sb We need to move the inode to the end of the list to actually make the spinning prevention explained in the comment above it work. With a plain list_move it will simply stay in place as we're always reclaiming from the head of the list. Signed-off-by: Christoph Hellwig --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index ec7924696a13..ecbb68dc7e2a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. 
*/ if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); + list_move_tail(&inode->i_lru, &sb->s_inode_lru); continue; } -- cgit v1.2.3-58-ga151 From eb28be2b4c0a0608e54f0a8fc237372c674eb7d0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:03 -0700 Subject: direct-io: separate fields only used in the submission path from struct dio This large, but largely mechanical, patch moves all fields in struct dio that are only used in the submission path into a separate on stack data structure. This has the advantage that the memory is very likely cache hot, which is not guaranteed for memory fresh out of kmalloc. This also gives gcc more optimization potential because it can more easily determine that there are no external aliases for these variables. The sdio initialization is now an initialization instead of a memset. This allows gcc to break sdio into individual fields and optimize away unnecessary zeroing (after all the functions are inlined). Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 389 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 201 insertions(+), 188 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 44a360ca8046..02ccf766903c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -55,13 +55,10 @@ * blocksize. */ -struct dio { - /* BIO submission state */ +/* dio_state only used in the submission path */ + +struct dio_submit { struct bio *bio; /* bio under assembly */ - struct inode *inode; - int rw; - loff_t i_size; /* i_size when submitted */ - int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -81,13 +78,12 @@ struct dio { int boundary; /* prev block is at a boundary */ int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ - dio_iodone_t *end_io; /* IO completion function */ dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ - struct buffer_head map_bh; /* last get_block() result */ /* * Deferred addition of a page to the dio. These variables are @@ -100,18 +96,6 @@ struct dio { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* BIO completion state */ - spinlock_t bio_lock; /* protects BIO fields below */ - unsigned long refcount; /* direct_io_worker() and bios */ - struct bio *bio_list; /* singly linked via bi_private */ - struct task_struct *waiter; /* waiting task (NULL if none) */ - - /* AIO related stuff */ - struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ - ssize_t result; /* IO result */ - /* * Page fetching state. These variables belong to dio_refill_pages(). 
*/ @@ -125,6 +109,30 @@ struct dio { */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ +}; + +/* dio_state communicated between submission path and end_io */ +struct dio { + int flags; /* doesn't change */ + struct inode *inode; + int rw; + loff_t i_size; /* i_size when submitted */ + dio_iodone_t *end_io; /* IO completion function */ + struct buffer_head map_bh; /* last get_block() result */ + + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + ssize_t result; /* IO result */ + int page_errors; /* errno from get_user_pages() */ /* @@ -182,27 +190,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done); /* * How many pages are in the queue? */ -static inline unsigned dio_pages_present(struct dio *dio) +static inline unsigned dio_pages_present(struct dio_submit *sdio) { - return dio->tail - dio->head; + return sdio->tail - sdio->head; } /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio) +static int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; - nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); ret = get_user_pages_fast( - dio->curr_user_address, /* Where from? */ + sdio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ &dio->pages[0]); /* Put results here */ - if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -213,17 +221,17 @@ static int dio_refill_pages(struct dio *dio) dio->page_errors = ret; page_cache_get(page); dio->pages[0] = page; - dio->head = 0; - dio->tail = 1; + sdio->head = 0; + sdio->tail = 1; ret = 0; goto out; } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; - dio->curr_page += ret; - dio->head = 0; - dio->tail = ret; + sdio->curr_user_address += ret * PAGE_SIZE; + sdio->curr_page += ret; + sdio->head = 0; + sdio->tail = ret; ret = 0; } out: @@ -236,17 +244,17 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. 
*/ -static struct page *dio_get_page(struct dio *dio) +static struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio) { - if (dio_pages_present(dio) == 0) { + if (dio_pages_present(sdio) == 0) { int ret; - ret = dio_refill_pages(dio); + ret = dio_refill_pages(dio, sdio); if (ret) return ERR_PTR(ret); - BUG_ON(dio_pages_present(dio) == 0); + BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[dio->head++]; + return dio->pages[sdio->head++]; } /** @@ -368,8 +376,9 @@ void dio_end_io(struct bio *bio, int error) EXPORT_SYMBOL_GPL(dio_end_io); static void -dio_bio_alloc(struct dio *dio, struct block_device *bdev, - sector_t first_sector, int nr_vecs) +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, + struct block_device *bdev, + sector_t first_sector, int nr_vecs) { struct bio *bio; @@ -386,8 +395,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, else bio->bi_end_io = dio_bio_end_io; - dio->bio = bio; - dio->logical_offset_in_bio = dio->cur_page_fs_offset; + sdio->bio = bio; + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } /* @@ -397,9 +406,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio) +static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { - struct bio *bio = dio->bio; + struct bio *bio = sdio->bio; unsigned long flags; bio->bi_private = dio; @@ -411,24 +420,24 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - if (dio->submit_io) - dio->submit_io(dio->rw, bio, dio->inode, - dio->logical_offset_in_bio); + if (sdio->submit_io) + sdio->submit_io(dio->rw, bio, dio->inode, + sdio->logical_offset_in_bio); else submit_bio(dio->rw, bio); - dio->bio = NULL; - dio->boundary = 0; - dio->logical_offset_in_bio = 0; + sdio->bio = NULL; + sdio->boundary = 0; + sdio->logical_offset_in_bio = 0; } /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio) +static void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + while (dio_pages_present(sdio)) + page_cache_release(dio_get_page(dio, sdio)); } /* @@ -518,11 +527,11 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio) +static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->reap_counter++ >= 64) { + if (sdio->reap_counter++ >= 64) { while (dio->bio_list) { unsigned long flags; struct bio *bio; @@ -536,14 +545,14 @@ static int dio_bio_reap(struct dio *dio) if (ret == 0) ret = ret2; } - dio->reap_counter = 0; + sdio->reap_counter = 0; } return ret; } /* * Call into the fs to map some more disk blocks. We record the current number - * of available blocks at dio->blocks_available. These are in units of the + * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). * * The fs is allowed to map lots of blocks at once. If it wants to do that, @@ -564,7 +573,7 @@ static int dio_bio_reap(struct dio *dio) * buffer_mapped(). However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. 
*/ -static int get_more_blocks(struct dio *dio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) { int ret; struct buffer_head *map_bh = &dio->map_bh; @@ -580,11 +589,11 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - BUG_ON(dio->block_in_file >= dio->final_block_in_request); - fs_startblk = dio->block_in_file >> dio->blkfactor; - dio_count = dio->final_block_in_request - dio->block_in_file; - fs_count = dio_count >> dio->blkfactor; - blkmask = (1 << dio->blkfactor) - 1; + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); + fs_startblk = sdio->block_in_file >> sdio->blkfactor; + dio_count = sdio->final_block_in_request - sdio->block_in_file; + fs_count = dio_count >> sdio->blkfactor; + blkmask = (1 << sdio->blkfactor) - 1; if (dio_count & blkmask) fs_count++; @@ -604,12 +613,12 @@ static int get_more_blocks(struct dio *dio) */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (dio->block_in_file < (i_size_read(dio->inode) >> - dio->blkbits)) + if (sdio->block_in_file < (i_size_read(dio->inode) >> + sdio->blkbits)) create = 0; } - ret = (*dio->get_block)(dio->inode, fs_startblk, + ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); } return ret; @@ -618,20 +627,21 @@ static int get_more_blocks(struct dio *dio) /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, sector_t start_sector) +static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector) { sector_t sector; int ret, nr_pages; - ret = dio_bio_reap(dio); + ret = dio_bio_reap(dio, sdio); if (ret) goto out; - sector = start_sector << (dio->blkbits - 9); - nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + sector = start_sector << (sdio->blkbits - 9); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); - dio->boundary = 0; + dio_bio_alloc(dio, sdio, dio->map_bh.b_bdev, sector, nr_pages); + sdio->boundary = 0; out: return ret; } @@ -643,21 +653,21 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio *dio) +static int dio_bio_add_page(struct dio_submit *sdio) { int ret; - ret = bio_add_page(dio->bio, dio->cur_page, - dio->cur_page_len, dio->cur_page_offset); - if (ret == dio->cur_page_len) { + ret = bio_add_page(sdio->bio, sdio->cur_page, + sdio->cur_page_len, sdio->cur_page_offset); + if (ret == sdio->cur_page_len) { /* * Decrement count only, if we are done with this page */ - if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) - dio->pages_in_io--; - page_cache_get(dio->cur_page); - dio->final_block_in_bio = dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits); + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) + sdio->pages_in_io--; + page_cache_get(sdio->cur_page); + sdio->final_block_in_bio = sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits); ret = 0; } else { ret = 1; @@ -675,14 +685,14 @@ static int dio_bio_add_page(struct dio *dio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. 
*/ -static int dio_send_cur_page(struct dio *dio) +static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->bio) { - loff_t cur_offset = dio->cur_page_fs_offset; - loff_t bio_next_offset = dio->logical_offset_in_bio + - dio->bio->bi_size; + if (sdio->bio) { + loff_t cur_offset = sdio->cur_page_fs_offset; + loff_t bio_next_offset = sdio->logical_offset_in_bio + + sdio->bio->bi_size; /* * See whether this new request is contiguous with the old. @@ -698,28 +708,28 @@ static int dio_send_cur_page(struct dio *dio) * be the next logical offset in the bio, submit the bio we * have. */ - if (dio->final_block_in_bio != dio->cur_page_block || + if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) - dio_bio_submit(dio); + dio_bio_submit(dio, sdio); /* * Submit now if the underlying fs is about to perform a * metadata read */ - else if (dio->boundary) - dio_bio_submit(dio); + else if (sdio->boundary) + dio_bio_submit(dio, sdio); } - if (dio->bio == NULL) { - ret = dio_new_bio(dio, dio->cur_page_block); + if (sdio->bio == NULL) { + ret = dio_new_bio(dio, sdio, sdio->cur_page_block); if (ret) goto out; } - if (dio_bio_add_page(dio) != 0) { - dio_bio_submit(dio); - ret = dio_new_bio(dio, dio->cur_page_block); + if (dio_bio_add_page(sdio) != 0) { + dio_bio_submit(dio, sdio); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block); if (ret == 0) { - ret = dio_bio_add_page(dio); + ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); } } @@ -745,7 +755,7 @@ out: * page to the dio instead. */ static int -submit_page_section(struct dio *dio, struct page *page, +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr) { int ret = 0; @@ -760,20 +770,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * Can we just grow the current page's presence in the dio? */ - if ( (dio->cur_page == page) && - (dio->cur_page_offset + dio->cur_page_len == offset) && - (dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits) == blocknr)) { - dio->cur_page_len += len; + if (sdio->cur_page == page && + sdio->cur_page_offset + sdio->cur_page_len == offset && + sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { + sdio->cur_page_len += len; /* - * If dio->boundary then we want to schedule the IO now to + * If sdio->boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (dio->boundary) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } goto out; } @@ -781,20 +791,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * If there's a deferred page already there then send it. 
*/ - if (dio->cur_page) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->cur_page) { + ret = dio_send_cur_page(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; if (ret) goto out; } page_cache_get(page); /* It is in dio */ - dio->cur_page = page; - dio->cur_page_offset = offset; - dio->cur_page_len = len; - dio->cur_page_block = blocknr; - dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; + sdio->cur_page = page; + sdio->cur_page_offset = offset; + sdio->cur_page_len = len; + sdio->cur_page_block = blocknr; + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: return ret; } @@ -826,19 +836,19 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, int end) +static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ unsigned this_chunk_bytes; struct page *page; - dio->start_zero_done = 1; - if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + sdio->start_zero_done = 1; + if (!sdio->blkfactor || !buffer_new(&dio->map_bh)) return; - dio_blocks_per_fs_block = 1 << dio->blkfactor; - this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + dio_blocks_per_fs_block = 1 << sdio->blkfactor; + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); if (!this_chunk_blocks) return; @@ -850,14 +860,14 @@ static void dio_zero_block(struct dio *dio, int end) if (end) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; - this_chunk_bytes = this_chunk_blocks << dio->blkbits; + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; page = ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes, - dio->next_block_for_io)) + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, + sdio->next_block_for_io)) return; - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; } /* @@ -876,9 +886,9 @@ static void dio_zero_block(struct dio *dio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. 
*/ -static int do_direct_IO(struct dio *dio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) { - const unsigned blkbits = dio->blkbits; + const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; @@ -886,10 +896,10 @@ static int do_direct_IO(struct dio *dio) int ret = 0; /* The I/O can start at any block offset within the first page */ - block_in_page = dio->first_block_in_page; + block_in_page = sdio->first_block_in_page; - while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + while (sdio->block_in_file < sdio->final_block_in_request) { + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; @@ -901,14 +911,14 @@ static int do_direct_IO(struct dio *dio) unsigned this_chunk_blocks; /* # of blocks */ unsigned u; - if (dio->blocks_available == 0) { + if (sdio->blocks_available == 0) { /* * Need to go and map some more disk */ unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio); + ret = get_more_blocks(dio, sdio); if (ret) { page_cache_release(page); goto out; @@ -916,18 +926,18 @@ static int do_direct_IO(struct dio *dio) if (!buffer_mapped(map_bh)) goto do_holes; - dio->blocks_available = - map_bh->b_size >> dio->blkbits; - dio->next_block_for_io = - map_bh->b_blocknr << dio->blkfactor; + sdio->blocks_available = + map_bh->b_size >> sdio->blkbits; + sdio->next_block_for_io = + map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) clean_blockdev_aliases(dio); - if (!dio->blkfactor) + if (!sdio->blkfactor) goto do_holes; - blkmask = (1 << dio->blkfactor) - 1; - dio_remainder = (dio->block_in_file & blkmask); + blkmask = (1 << sdio->blkfactor) - 1; + dio_remainder = (sdio->block_in_file & blkmask); /* * If we are at the start of IO and that IO @@ -941,8 +951,8 @@ static int do_direct_IO(struct dio *dio) * on-disk */ if (!buffer_new(map_bh)) - dio->next_block_for_io += dio_remainder; - dio->blocks_available -= dio_remainder; + sdio->next_block_for_io += dio_remainder; + sdio->blocks_available -= dio_remainder; } do_holes: /* Handle holes */ @@ -961,7 +971,7 @@ do_holes: */ i_size_aligned = ALIGN(i_size_read(dio->inode), 1 << blkbits); - if (dio->block_in_file >= + if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); @@ -969,7 +979,7 @@ do_holes: } zero_user(page, block_in_page << blkbits, 1 << blkbits); - dio->block_in_file++; + sdio->block_in_file++; block_in_page++; goto next_block; } @@ -979,38 +989,40 @@ do_holes: * is finer than the underlying fs, go check to see if * we must zero out the start of this block. 
*/ - if (unlikely(dio->blkfactor && !dio->start_zero_done)) - dio_zero_block(dio, 0); + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) + dio_zero_block(dio, sdio, 0); /* * Work out, in this_chunk_blocks, how much disk we * can add to this page */ - this_chunk_blocks = dio->blocks_available; + this_chunk_blocks = sdio->blocks_available; u = (PAGE_SIZE - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; - u = dio->final_block_in_request - dio->block_in_file; + u = sdio->final_block_in_request - sdio->block_in_file; if (this_chunk_blocks > u) this_chunk_blocks = u; this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, - this_chunk_bytes, dio->next_block_for_io); + sdio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, sdio, page, + offset_in_page, + this_chunk_bytes, + sdio->next_block_for_io); if (ret) { page_cache_release(page); goto out; } - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; - dio->block_in_file += this_chunk_blocks; + sdio->block_in_file += this_chunk_blocks; block_in_page += this_chunk_blocks; - dio->blocks_available -= this_chunk_blocks; + sdio->blocks_available -= this_chunk_blocks; next_block: - BUG_ON(dio->block_in_file > dio->final_block_in_request); - if (dio->block_in_file == dio->final_block_in_request) + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); + if (sdio->block_in_file == sdio->final_block_in_request) break; } @@ -1026,7 +1038,7 @@ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio) + dio_submit_t submit_io, struct dio *dio, struct dio_submit *sdio) { unsigned long user_addr; unsigned long flags; @@ -1037,15 +1049,15 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->inode = inode; dio->rw = rw; - dio->blkbits = blkbits; - dio->blkfactor = inode->i_blkbits - blkbits; - dio->block_in_file = offset >> blkbits; + sdio->blkbits = blkbits; + sdio->blkfactor = inode->i_blkbits - blkbits; + sdio->block_in_file = offset >> blkbits; - dio->get_block = get_block; + sdio->get_block = get_block; dio->end_io = end_io; - dio->submit_io = submit_io; - dio->final_block_in_bio = -1; - dio->next_block_for_io = -1; + sdio->submit_io = submit_io; + sdio->final_block_in_bio = -1; + sdio->next_block_for_io = -1; dio->iocb = iocb; dio->i_size = i_size_read(inode); @@ -1057,45 +1069,45 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * In case of non-aligned buffers, we may need 2 more * pages since we need to zero out first and last block. 
*/ - if (unlikely(dio->blkfactor)) - dio->pages_in_io = 2; + if (unlikely(sdio->blkfactor)) + sdio->pages_in_io = 2; for (seg = 0; seg < nr_segs; seg++) { user_addr = (unsigned long)iov[seg].iov_base; - dio->pages_in_io += + sdio->pages_in_io += ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - user_addr/PAGE_SIZE); } for (seg = 0; seg < nr_segs; seg++) { user_addr = (unsigned long)iov[seg].iov_base; - dio->size += bytes = iov[seg].iov_len; + sdio->size += bytes = iov[seg].iov_len; /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - dio->final_block_in_request = dio->block_in_file + + sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio->final_block_in_request = sdio->block_in_file + (bytes >> blkbits); /* Page fetching state */ - dio->head = 0; - dio->tail = 0; - dio->curr_page = 0; + sdio->head = 0; + sdio->tail = 0; + sdio->curr_page = 0; - dio->total_pages = 0; + sdio->total_pages = 0; if (user_addr & (PAGE_SIZE-1)) { - dio->total_pages++; + sdio->total_pages++; bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - dio->curr_user_address = user_addr; + sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->curr_user_address = user_addr; - ret = do_direct_IO(dio); + ret = do_direct_IO(dio, sdio); dio->result += iov[seg].iov_len - - ((dio->final_block_in_request - dio->block_in_file) << + ((sdio->final_block_in_request - sdio->block_in_file) << blkbits); if (ret) { - dio_cleanup(dio); + dio_cleanup(dio, sdio); break; } } /* end iovec loop */ @@ -1111,23 +1123,23 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * There may be some unwritten disk at the end of a part-written * fs-block-sized block. Go zero that now. */ - dio_zero_block(dio, 1); + dio_zero_block(dio, sdio, 1); - if (dio->cur_page) { - ret2 = dio_send_cur_page(dio); + if (sdio->cur_page) { + ret2 = dio_send_cur_page(dio, sdio); if (ret == 0) ret = ret2; - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } - if (dio->bio) - dio_bio_submit(dio); + if (sdio->bio) + dio_bio_submit(dio, sdio); /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. */ - dio_cleanup(dio); + dio_cleanup(dio, sdio); /* * All block lookups have been performed. For READ requests @@ -1146,7 +1158,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(ret == -EIOCBQUEUED); if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == dio->size))) + ((rw & READ) || (dio->result == sdio->size))) ret = -EIOCBQUEUED; if (ret != -EIOCBQUEUED) @@ -1211,6 +1223,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; + struct dio_submit sdio = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1290,7 +1303,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, - submit_io, dio); + submit_io, dio, &sdio); out: return retval; -- cgit v1.2.3-58-ga151 From cde1ecb3247f67c167918ea6326159209996fd54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:04 -0700 Subject: direct-io: fix a wrong comment There's nothing on the stack, even before my changes. 
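For scale, assuming 64-bit pointers (the patch itself does not state a word size):

    #define DIO_PAGES	64	/* pages[] = 64 * sizeof(struct page *) = 512 bytes */

so the page buffer alone is half a kilobyte of every struct dio, which is why the following patches in this series care about where the structure is allocated.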
Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 02ccf766903c..c1e03ae961af 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,7 +39,7 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure in the slab cache */ #define DIO_PAGES 64 -- cgit v1.2.3-58-ga151 From 0dc2bc49be545626a2dc6da133202ffe69ac3fcc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:05 -0700 Subject: direct-io: rearrange fields in dio/dio_submit to avoid holes Fix most problems reported by pahole. There is still a weird 104 byte hole after map_bh. I'm not sure what causes this. Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index c1e03ae961af..b01ea0d7160d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -73,10 +73,10 @@ struct dio_submit { sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ + int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ - int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ dio_submit_t *submit_io; /* IO submition function */ @@ -114,27 +114,26 @@ struct dio_submit { /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - struct inode *inode; int rw; + struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ - struct buffer_head map_bh; /* last get_block() result */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ + int page_errors; /* errno from get_user_pages() */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ struct task_struct *waiter; /* waiting task (NULL if none) */ /* AIO related stuff */ struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ ssize_t result; /* IO result */ - int page_errors; /* errno from get_user_pages() */ - + struct buffer_head map_bh; /* last get_block() result */ /* * pages[] (and any fields placed after it) are not zeroed out at * allocation time. Don't add new fields after pages[] unless you -- cgit v1.2.3-58-ga151 From 6e8267f532a17165ab551ac5fdafcba5333dcca5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:06 -0700 Subject: direct-io: use a slab cache for struct dio A direct slab call is slightly faster than kmalloc and can be better cached per CPU. It also avoids rounding to the next kmalloc slab. In addition this enforces cache line alignment for struct dio to avoid any false sharing. 
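A minimal, self-contained sketch of the pattern this commit adopts, a dedicated cacheline-aligned slab cache, using illustrative names ("foo"); only KMEM_CACHE(), kmem_cache_alloc()/kmem_cache_free(), ____cacheline_aligned_in_smp and SLAB_PANIC are taken from the patch:

    #include <linux/module.h>
    #include <linux/slab.h>

    struct foo {
            long hot_state;
    } ____cacheline_aligned_in_smp;	/* keep CPUs off each other's cache lines */

    static struct kmem_cache *foo_cache __read_mostly;

    static int __init foo_init(void)
    {
            /* name, object size and alignment are derived from the type;
             * SLAB_PANIC makes cache-creation failure fatal at boot */
            foo_cache = KMEM_CACHE(foo, SLAB_PANIC);
            return 0;
    }
    module_init(foo_init);

    static struct foo *foo_alloc(void)
    {
            /* exactly sized and cached per CPU, unlike a rounded-up kmalloc() */
            return kmem_cache_alloc(foo_cache, GFP_KERNEL);
    }

    static void foo_free(struct foo *f)
    {
            kmem_cache_free(foo_cache, f);
    }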
Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index b01ea0d7160d..6bb04409e725 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -140,7 +140,9 @@ struct dio { * wish that they not be zeroed. */ struct page *pages[DIO_PAGES]; /* page buffer */ -}; +} ____cacheline_aligned_in_smp; + +static struct kmem_cache *dio_cache __read_mostly; static void __inode_dio_wait(struct inode *inode) { @@ -330,7 +332,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) if (remaining == 0) { dio_complete(dio, dio->iocb->ki_pos, 0, true); - kfree(dio); + kmem_cache_free(dio_cache, dio); } } @@ -1180,7 +1182,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (ret2 == 0) { ret = dio_complete(dio, offset, ret, false); - kfree(dio); + kmem_cache_free(dio_cache, dio); } else BUG_ON(ret != -EIOCBQUEUED); @@ -1256,7 +1258,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw == READ && end == offset) return 0; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; @@ -1280,7 +1282,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, end - 1); if (retval) { mutex_unlock(&inode->i_mutex); - kfree(dio); + kmem_cache_free(dio_cache, dio); goto out; } } @@ -1308,3 +1310,10 @@ out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); + +static __init int dio_init(void) +{ + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); + return 0; +} +module_init(dio_init) -- cgit v1.2.3-58-ga151 From 18772641dbe2c89c6122c603f81f6a9574aee556 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:07 -0700 Subject: direct-io: separate map_bh from dio Only a single b_private field in the map_bh buffer head is needed after the submission path. Move map_bh separately to avoid storing this information in the long term slab. This avoids the weird 104 byte hole in struct dio_submit which also needed to be memseted early. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 66 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 6bb04409e725..edf3174afd6a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -119,6 +119,7 @@ struct dio { loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ + void *private; /* copy from map_bh.b_private */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ @@ -133,7 +134,6 @@ struct dio { struct kiocb *iocb; /* kiocb */ ssize_t result; /* IO result */ - struct buffer_head map_bh; /* last get_block() result */ /* * pages[] (and any fields placed after it) are not zeroed out at * allocation time. Don't add new fields after pages[] unless you @@ -301,7 +301,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private, ret, is_async); + dio->private, ret, is_async); } else { if (is_async) aio_complete(dio->iocb, ret, 0); @@ -574,10 +574,10 @@ static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * buffer_mapped(). 
However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. */ -static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret; - struct buffer_head *map_bh = &dio->map_bh; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ @@ -621,6 +621,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); + + /* Store for completion */ + dio->private = map_bh->b_private; } return ret; } @@ -629,7 +632,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio) * There is no bio. Make one now. */ static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, - sector_t start_sector) + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; @@ -638,10 +641,10 @@ static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, sdio, dio->map_bh.b_bdev, sector, nr_pages); + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; out: return ret; @@ -686,7 +689,8 @@ static int dio_bio_add_page(struct dio_submit *sdio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio) +static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; @@ -721,14 +725,14 @@ static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio) } if (sdio->bio == NULL) { - ret = dio_new_bio(dio, sdio, sdio->cur_page_block); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret) goto out; } if (dio_bio_add_page(sdio) != 0) { dio_bio_submit(dio, sdio); - ret = dio_new_bio(dio, sdio, sdio->cur_page_block); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); @@ -757,7 +761,8 @@ out: */ static int submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, - unsigned offset, unsigned len, sector_t blocknr) + unsigned offset, unsigned len, sector_t blocknr, + struct buffer_head *map_bh) { int ret = 0; @@ -782,7 +787,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, * avoid metadata seeks. */ if (sdio->boundary) { - ret = dio_send_cur_page(dio, sdio); + ret = dio_send_cur_page(dio, sdio, map_bh); page_cache_release(sdio->cur_page); sdio->cur_page = NULL; } @@ -793,7 +798,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, * If there's a deferred page already there then send it. */ if (sdio->cur_page) { - ret = dio_send_cur_page(dio, sdio); + ret = dio_send_cur_page(dio, sdio, map_bh); page_cache_release(sdio->cur_page); sdio->cur_page = NULL; if (ret) @@ -815,16 +820,16 @@ out: * file blocks. 
Only called for S_ISREG files - blockdevs do not set * buffer_new */ -static void clean_blockdev_aliases(struct dio *dio) +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) { unsigned i; unsigned nblocks; - nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + nblocks = map_bh->b_size >> dio->inode->i_blkbits; for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(dio->map_bh.b_bdev, - dio->map_bh.b_blocknr + i); + unmap_underlying_metadata(map_bh->b_bdev, + map_bh->b_blocknr + i); } } @@ -837,7 +842,8 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) +static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end, + struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ @@ -845,7 +851,7 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) struct page *page; sdio->start_zero_done = 1; - if (!sdio->blkfactor || !buffer_new(&dio->map_bh)) + if (!sdio->blkfactor || !buffer_new(map_bh)) return; dio_blocks_per_fs_block = 1 << sdio->blkfactor; @@ -865,7 +871,7 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) page = ZERO_PAGE(0); if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, - sdio->next_block_for_io)) + sdio->next_block_for_io, map_bh)) return; sdio->next_block_for_io += this_chunk_blocks; @@ -887,13 +893,13 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ -static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; - struct buffer_head *map_bh = &dio->map_bh; int ret = 0; /* The I/O can start at any block offset within the first page */ @@ -919,7 +925,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio, sdio); + ret = get_more_blocks(dio, sdio, map_bh); if (ret) { page_cache_release(page); goto out; @@ -932,7 +938,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio) sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) - clean_blockdev_aliases(dio); + clean_blockdev_aliases(dio, map_bh); if (!sdio->blkfactor) goto do_holes; @@ -991,7 +997,7 @@ do_holes: * we must zero out the start of this block. 
*/ if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) - dio_zero_block(dio, sdio, 0); + dio_zero_block(dio, sdio, 0, map_bh); /* * Work out, in this_chunk_blocks, how much disk we @@ -1011,7 +1017,8 @@ do_holes: ret = submit_page_section(dio, sdio, page, offset_in_page, this_chunk_bytes, - sdio->next_block_for_io); + sdio->next_block_for_io, + map_bh); if (ret) { page_cache_release(page); goto out; @@ -1047,6 +1054,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ssize_t ret = 0; ssize_t ret2; size_t bytes; + struct buffer_head map_bh = { 0, }; dio->inode = inode; dio->rw = rw; @@ -1101,7 +1109,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; sdio->curr_user_address = user_addr; - ret = do_direct_IO(dio, sdio); + ret = do_direct_IO(dio, sdio, &map_bh); dio->result += iov[seg].iov_len - ((sdio->final_block_in_request - sdio->block_in_file) << @@ -1124,10 +1132,10 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * There may be some unwritten disk at the end of a part-written * fs-block-sized block. Go zero that now. */ - dio_zero_block(dio, sdio, 1); + dio_zero_block(dio, sdio, 1, &map_bh); if (sdio->cur_page) { - ret2 = dio_send_cur_page(dio, sdio); + ret2 = dio_send_cur_page(dio, sdio, &map_bh); if (ret == 0) ret = ret2; page_cache_release(sdio->cur_page); -- cgit v1.2.3-58-ga151 From ba253fbf6d3502c54e1ac8792e7ac8290a1f5b8d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:08 -0700 Subject: direct-io: inline the complete submission path Add inlines to all the submission path functions. While this increases code size it also gives gcc a lot of optimization opportunities in this critical hotpath. In particular -- together with some other changes -- this allows gcc to get rid of the unnecessary clearing of sdio at the beginning and optimize the messy parameter passing. Any non inlining of a function which takes a sdio parameter would break this optimization because they cannot be done if the address of a structure is taken. Note that benefits are only seen with CONFIG_OPTIMIZE_INLINING and CONFIG_CC_OPTIMIZE_FOR_SIZE both set to off. This gives about 2.2% improvement on a large database benchmark with a high IOPS rate. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index edf3174afd6a..6d425821be66 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -199,7 +199,7 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; @@ -245,7 +245,8 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. 
*/ -static struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio) +static inline struct page *dio_get_page(struct dio *dio, + struct dio_submit *sdio) { if (dio_pages_present(sdio) == 0) { int ret; @@ -376,7 +377,7 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static void +static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, sector_t first_sector, int nr_vecs) @@ -407,7 +408,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { struct bio *bio = sdio->bio; unsigned long flags; @@ -435,7 +436,7 @@ static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio, struct dio_submit *sdio) +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { while (dio_pages_present(sdio)) page_cache_release(dio_get_page(dio, sdio)); @@ -528,7 +529,7 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; @@ -631,8 +632,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, - sector_t start_sector, struct buffer_head *map_bh) +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; @@ -657,7 +658,7 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio_submit *sdio) +static inline int dio_bio_add_page(struct dio_submit *sdio) { int ret; @@ -689,8 +690,8 @@ static int dio_bio_add_page(struct dio_submit *sdio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, - struct buffer_head *map_bh) +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; @@ -759,7 +760,7 @@ out: * If that doesn't work out then we put the old page into the bio and add this * page to the dio instead. */ -static int +static inline int submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr, struct buffer_head *map_bh) @@ -842,8 +843,8 @@ static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. 
*/ -static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end, - struct buffer_head *map_bh) +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, + int end, struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ @@ -1042,7 +1043,7 @@ out: return ret; } -static ssize_t +static inline ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, @@ -1216,6 +1217,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * expected that filesystem provide exclusion between new direct I/O * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, * but other filesystems need to take care of this on their own. + * + * NOTE: if you pass "sdio" to anything by pointer make sure that function + * is always inlined. Otherwise gcc is unable to split the structure into + * individual fields and will generate much worse code. This is important + * for the whole file. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, -- cgit v1.2.3-58-ga151 From 847cc6371ba820763773e993000410d6d8d23515 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Aug 2011 21:38:09 -0700 Subject: direct-io: merge direct_io_walker into __blockdev_direct_IO This doesn't change anything for the compiler, but hch thought it would make the code clearer. I moved the reference counting into its own little inline. Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Signed-off-by: Christoph Hellwig --- fs/direct-io.c | 271 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 132 insertions(+), 139 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 6d425821be66..d740ab67ff6e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1043,136 +1043,10 @@ out: return ret; } -static inline ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio, struct dio_submit *sdio) +static inline int drop_refcount(struct dio *dio) { - unsigned long user_addr; + int ret2; unsigned long flags; - int seg; - ssize_t ret = 0; - ssize_t ret2; - size_t bytes; - struct buffer_head map_bh = { 0, }; - - dio->inode = inode; - dio->rw = rw; - sdio->blkbits = blkbits; - sdio->blkfactor = inode->i_blkbits - blkbits; - sdio->block_in_file = offset >> blkbits; - - sdio->get_block = get_block; - dio->end_io = end_io; - sdio->submit_io = submit_io; - sdio->final_block_in_bio = -1; - sdio->next_block_for_io = -1; - - dio->iocb = iocb; - dio->i_size = i_size_read(inode); - - spin_lock_init(&dio->bio_lock); - dio->refcount = 1; - - /* - * In case of non-aligned buffers, we may need 2 more - * pages since we need to zero out first and last block. 
- */ - if (unlikely(sdio->blkfactor)) - sdio->pages_in_io = 2; - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio->pages_in_io += - ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - - user_addr/PAGE_SIZE); - } - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio->size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - sdio->final_block_in_request = sdio->block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - sdio->head = 0; - sdio->tail = 0; - sdio->curr_page = 0; - - sdio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - sdio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - sdio->curr_user_address = user_addr; - - ret = do_direct_IO(dio, sdio, &map_bh); - - dio->result += iov[seg].iov_len - - ((sdio->final_block_in_request - sdio->block_in_file) << - blkbits); - - if (ret) { - dio_cleanup(dio, sdio); - break; - } - } /* end iovec loop */ - - if (ret == -ENOTBLK) { - /* - * The remaining part of the request will be - * be handled by buffered I/O when we return - */ - ret = 0; - } - /* - * There may be some unwritten disk at the end of a part-written - * fs-block-sized block. Go zero that now. - */ - dio_zero_block(dio, sdio, 1, &map_bh); - - if (sdio->cur_page) { - ret2 = dio_send_cur_page(dio, sdio, &map_bh); - if (ret == 0) - ret = ret2; - page_cache_release(sdio->cur_page); - sdio->cur_page = NULL; - } - if (sdio->bio) - dio_bio_submit(dio, sdio); - - /* - * It is possible that, we return short IO due to end of file. - * In that case, we need to release all the pages we got hold on. - */ - dio_cleanup(dio, sdio); - - /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if (rw == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); - - /* - * The only time we want to leave bios in flight is when a successful - * partial aio read or full aio write have been setup. In that case - * bio completion will call aio_complete. The only time it's safe to - * call aio_complete is when we return -EIOCBQUEUED, so we key on that. - * This had *better* be the only place that raises -EIOCBQUEUED. 
- */ - BUG_ON(ret == -EIOCBQUEUED); - if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == sdio->size))) - ret = -EIOCBQUEUED; - - if (ret != -EIOCBQUEUED) - dio_await_completion(dio); /* * Sync will always be dropping the final ref and completing the @@ -1188,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_irqsave(&dio->bio_lock, flags); ret2 = --dio->refcount; spin_unlock_irqrestore(&dio->bio_lock, flags); - - if (ret2 == 0) { - ret = dio_complete(dio, offset, ret, false); - kmem_cache_free(dio_cache, dio); - } else - BUG_ON(ret != -EIOCBQUEUED); - - return ret; + return ret2; } /* @@ -1239,6 +1106,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, loff_t end = offset; struct dio *dio; struct dio_submit sdio = { 0, }; + unsigned long user_addr; + size_t bytes; + struct buffer_head map_bh = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1316,9 +1186,132 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, - submit_io, dio, &sdio); + retval = 0; + + dio->inode = inode; + dio->rw = rw; + sdio.blkbits = blkbits; + sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.block_in_file = offset >> blkbits; + + sdio.get_block = get_block; + dio->end_io = end_io; + sdio.submit_io = submit_io; + sdio.final_block_in_bio = -1; + sdio.next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(sdio.blkfactor)) + sdio.pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.pages_in_io += + ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / + PAGE_SIZE - user_addr / PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio.final_block_in_request = sdio.block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + sdio.head = 0; + sdio.tail = 0; + sdio.curr_page = 0; + + sdio.total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + sdio.total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio.curr_user_address = user_addr; + + retval = do_direct_IO(dio, &sdio, &map_bh); + + dio->result += iov[seg].iov_len - + ((sdio.final_block_in_request - sdio.block_in_file) << + blkbits); + + if (retval) { + dio_cleanup(dio, &sdio); + break; + } + } /* end iovec loop */ + + if (retval == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + retval = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. 
+ */ + dio_zero_block(dio, &sdio, 1, &map_bh); + + if (sdio.cur_page) { + ssize_t ret2; + + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); + if (retval == 0) + retval = ret2; + page_cache_release(sdio.cur_page); + sdio.cur_page = NULL; + } + if (sdio.bio) + dio_bio_submit(dio, &sdio); + + /* + * It is possible that, we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio, &sdio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that its achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(retval == -EIOCBQUEUED); + if (dio->is_async && retval == 0 && dio->result && + ((rw & READ) || (dio->result == sdio.size))) + retval = -EIOCBQUEUED; + + if (retval != -EIOCBQUEUED) + dio_await_completion(dio); + + if (drop_refcount(dio) == 0) { + retval = dio_complete(dio, offset, retval, false); + kmem_cache_free(dio_cache, dio); + } else + BUG_ON(retval != -EIOCBQUEUED); out: return retval; -- cgit v1.2.3-58-ga151 From ef3d0fd27e90f67e35da516dafc1482c82939a60 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:48 -0700 Subject: vfs: do (nearly) lockless generic_file_llseek The i_mutex lock use of generic _file_llseek hurts. Independent processes accessing the same file synchronize over a single lock, even though they have no need for synchronization at all. Under high utilization this can cause llseek to scale very poorly on larger systems. This patch does some rethinking of the llseek locking model: First the 64bit f_pos is not necessarily atomic without locks on 32bit systems. This can already cause races with read() today. This was discussed on linux-kernel in the past and deemed acceptable. The patch does not change that. Let's look at the different seek variants: SEEK_SET: Doesn't really need any locking. If there's a race one writer wins, the other loses. For 32bit the non atomic update races against read() stay the same. Without a lock they can also happen against write() now. The read() race was deemed acceptable in past discussions, and I think if it's ok for read it's ok for write too. => Don't need a lock. SEEK_END: This behaves like SEEK_SET plus it reads the maximum size too. Reading the maximum size would have the 32bit atomic problem. But luckily we already have a way to read the maximum size without locking (i_size_read), so we can just use that instead. Without i_mutex there is no synchronization with write() anymore, however since the write() update is atomic on 64bit it just behaves like another racy SEEK_SET. On non atomic 32bit it's the same as SEEK_SET. => Don't need a lock, but need to use i_size_read() SEEK_CUR: This has a read-modify-write race window on the same file. One could argue that any application doing unsynchronized seeks on the same file is already broken. But for the sake of not adding a regression here I'm using the file->f_lock to synchronize this. 
Using this lock is much better than the inode mutex because it doesn't synchronize between processes. => So still need a lock, but can use a f_lock. This patch implements this new scheme in generic_file_llseek. I dropped generic_file_llseek_unlocked and changed all callers. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/btrfs/file.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/gfs2/file.c | 4 +-- fs/nfs/file.c | 5 ++-- fs/read_write.c | 85 ++++++++++++++++++++++++++---------------------------- include/linux/fs.h | 9 ++++-- 6 files changed, 54 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4e57d59edb7..1266f6e9cdb2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1821,7 +1821,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) switch (origin) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek_unlocked(file, offset, origin); + offset = generic_file_llseek(file, offset, origin); goto out; case SEEK_DATA: case SEEK_HOLE: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 54b8f1e7da94..db7ce87d37a5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -723,7 +723,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (rc < 0) return (loff_t)rc; } - return generic_file_llseek_unlocked(file, offset, origin); + return generic_file_llseek(file, offset, origin); } static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e802903..fe6bc0207818 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -63,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = generic_file_llseek_unlocked(file, offset, origin); + error = generic_file_llseek(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = generic_file_llseek_unlocked(file, offset, origin); + error = generic_file_llseek(file, offset, origin); return error; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f3cda3..12623abcf3d4 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -198,11 +198,12 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; + /* AK: should drop this lock. Unlikely to be needed. 
*/ spin_lock(&inode->i_lock); - loff = generic_file_llseek_unlocked(filp, offset, origin); + loff = generic_file_llseek(filp, offset, origin); spin_unlock(&inode->i_lock); } else - loff = generic_file_llseek_unlocked(filp, offset, origin); + loff = generic_file_llseek(filp, offset, origin); return loff; } diff --git a/fs/read_write.c b/fs/read_write.c index 179f1c33ea57..672b187def62 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } +static loff_t lseek_execute(struct file *file, struct inode *inode, + loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} + /** - * generic_file_llseek_unlocked - lockless generic llseek implementation + * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek * - * Updates the file offset to the value specified by @offset and @origin. - * Locking must be provided by the caller. + * This is a generic implemenation of ->llseek usable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. + * + * Synchronization: + * SEEK_SET is unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks. + * SEEK_END */ loff_t -generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +generic_file_llseek(struct file *file, loff_t offset, int origin) { struct inode *inode = file->f_mapping->host; switch (origin) { case SEEK_END: - offset += inode->i_size; + offset += i_size_read(inode); break; case SEEK_CUR: /* @@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) */ if (offset == 0) return file->f_pos; - offset += file->f_pos; - break; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + spin_lock(&file->f_lock); + offset = lseek_execute(file, inode, file->f_pos + offset, + inode->i_sb->s_maxbytes); + spin_unlock(&file->f_lock); + return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; break; case SEEK_HOLE: @@ -77,46 +107,13 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; - offset = inode->i_size; + offset = i_size_read(inode); break; } - if (offset < 0 && !unsigned_offsets(file)) - return -EINVAL; - if (offset > inode->i_sb->s_maxbytes) - return -EINVAL; - - /* Special lock needed here? 
*/ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - - return offset; -} -EXPORT_SYMBOL(generic_file_llseek_unlocked); - -/** - * generic_file_llseek - generic llseek implementation for regular files - * @file: file structure to seek on - * @offset: file offset to seek to - * @origin: type of seek - * - * This is a generic implemenation of ->llseek useable for all normal local - * filesystems. It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. - */ -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) -{ - loff_t rval; - - mutex_lock(&file->f_dentry->d_inode->i_mutex); - rval = generic_file_llseek_unlocked(file, offset, origin); - mutex_unlock(&file->f_dentry->d_inode->i_mutex); - - return rval; + return lseek_execute(file, inode, offset, inode->i_sb->s_maxbytes); } EXPORT_SYMBOL(generic_file_llseek); diff --git a/include/linux/fs.h b/include/linux/fs.h index c1884e974ff4..db85196f6308 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -964,7 +964,12 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ + + /* + * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. + * Must not be taken from IRQ context. + */ + spinlock_t f_lock; #ifdef CONFIG_SMP int f_sb_list_cpu; #endif @@ -2398,8 +2403,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); -extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, - int origin); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-58-ga151 From 5760495a872d63a182962680a13c2af29235237c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:50 -0700 Subject: vfs: add generic_file_llseek_size Add a generic_file_llseek variant to the VFS that allows passing in the maximum file size of the file system, instead of always using maxbytes from the superblock. This can be used to eliminate some cut'n'paste seek code in ext4. Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/read_write.c | 37 ++++++++++++++++++++++++++++--------- include/linux/fs.h | 2 ++ 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 672b187def62..dfd125798791 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -51,23 +51,23 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, } /** - * generic_file_llseek - generic llseek implementation for regular files + * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek + * @size: max size of file system * - * This is a generic implemenation of ->llseek usable for all normal local - * filesystems. It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. + * This is a variant of generic_file_llseek that allows passing in a custom + * file size. 
* * Synchronization: - * SEEK_SET is unsynchronized (but atomic on 64bit platforms) + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. - * SEEK_END */ loff_t -generic_file_llseek(struct file *file, loff_t offset, int origin) +generic_file_llseek_size(struct file *file, loff_t offset, int origin, + loff_t maxsize) { struct inode *inode = file->f_mapping->host; @@ -91,7 +91,7 @@ generic_file_llseek(struct file *file, loff_t offset, int origin) */ spin_lock(&file->f_lock); offset = lseek_execute(file, inode, file->f_pos + offset, - inode->i_sb->s_maxbytes); + maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: @@ -113,7 +113,26 @@ generic_file_llseek(struct file *file, loff_t offset, int origin) break; } - return lseek_execute(file, inode, offset, inode->i_sb->s_maxbytes); + return lseek_execute(file, inode, offset, maxsize); +} +EXPORT_SYMBOL(generic_file_llseek_size); + +/** + * generic_file_llseek - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is a generic implemenation of ->llseek useable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. + */ +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + + return generic_file_llseek_size(file, offset, origin, + inode->i_sb->s_maxbytes); } EXPORT_SYMBOL(generic_file_llseek); diff --git a/include/linux/fs.h b/include/linux/fs.h index db85196f6308..d055cc7d7240 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2403,6 +2403,8 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, + int origin, loff_t maxsize); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-58-ga151 From 4cce0e28b932c11454f75d1c1fae674600c23fbf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:51 -0700 Subject: ext4: replace cut'n'pasted llseek code with generic_file_llseek_size This gives ext4 the benefits of unlocked llseek. 
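A sketch of the ->llseek call pattern this helper enables for a filesystem with its own size limit; "myfs" and myfs_maxbytes() are hypothetical, while ext4's real version in the diff below picks between s_bitmap_maxbytes and s_maxbytes:

    #include <linux/fs.h>

    loff_t myfs_maxbytes(struct super_block *sb);	/* hypothetical limit helper */

    static loff_t myfs_llseek(struct file *file, loff_t offset, int origin)
    {
            struct inode *inode = file->f_mapping->host;

            /* no i_mutex: SEEK_SET/SEEK_END are lockless, SEEK_CUR takes f_lock */
            return generic_file_llseek_size(file, offset, origin,
                                            myfs_maxbytes(inode->i_sb));
    }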
Cc: tytso@mit.edu Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/ext4/file.c | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e4095e988eba..b9548f477bb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; else maxbytes = inode->i_sb->s_maxbytes; - mutex_lock(&inode->i_mutex); - switch (origin) { - case SEEK_END: - offset += inode->i_size; - break; - case SEEK_CUR: - if (offset == 0) { - mutex_unlock(&inode->i_mutex); - return file->f_pos; - } - offset += file->f_pos; - break; - case SEEK_DATA: - /* - * In the generic case the entire file is data, so as long as - * offset isn't at the end of the file then the offset is data. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - break; - case SEEK_HOLE: - /* - * There is a virtual hole at the end of the file, so as long as - * offset isn't i_size or larger, return i_size. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - offset = inode->i_size; - break; - } - - if (offset < 0 || offset > maxbytes) { - mutex_unlock(&inode->i_mutex); - return -EINVAL; - } - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - mutex_unlock(&inode->i_mutex); - return offset; + return generic_file_llseek_size(file, offset, origin, maxbytes); } const struct file_operations ext4_file_operations = { -- cgit v1.2.3-58-ga151 From 79835a710d6ff811659c8de46f89c7577c3b8cc6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 15 Sep 2011 16:06:52 -0700 Subject: nfs: drop unnecessary locking in llseek This makes NFS follow the standard generic_file_llseek locking scheme. Cc: Trond.Myklebust@netapp.com Signed-off-by: Andi Kleen Signed-off-by: Christoph Hellwig --- fs/nfs/file.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 12623abcf3d4..91c01f0a4c3b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -180,8 +180,6 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { - loff_t loff; - dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, @@ -197,14 +195,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; + } - /* AK: should drop this lock. Unlikely to be needed. */ - spin_lock(&inode->i_lock); - loff = generic_file_llseek(filp, offset, origin); - spin_unlock(&inode->i_lock); - } else - loff = generic_file_llseek(filp, offset, origin); - return loff; + return generic_file_llseek(filp, offset, origin); } /* -- cgit v1.2.3-58-ga151 From f3c7691e8d30d88899b514675c7c86d19057b5fd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Sep 2011 10:58:13 -0400 Subject: leases: fix write-open/read-lease race In setlease, we use i_writecount to decide whether we can give out a read lease. In open, we break leases before incrementing i_writecount. There is therefore a window between the break lease and the i_writecount increment when setlease could add a new read lease. This would leave us with a simultaneous write open and read lease, which shouldn't happen. Signed-off-by: J. 
Bruce Fields Signed-off-by: Christoph Hellwig --- fs/namei.c | 5 +---- fs/open.c | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9061157e39d6..7657be4352bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2035,10 +2035,7 @@ static int may_open(struct path *path, int acc_mode, int flag) if (flag & O_NOATIME && !inode_owner_or_capable(inode)) return -EPERM; - /* - * Ensure there are no outstanding leases on the file. - */ - return break_lease(inode, flag); + return 0; } static int handle_truncate(struct file *filp) diff --git a/fs/open.c b/fs/open.c index f71192109457..22c41b543f2d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; + error = break_lease(inode, f->f_flags); + if (error) + goto cleanup_all; + if (!open && f->f_op) open = f->f_op->open; if (open) { -- cgit v1.2.3-58-ga151 From d12799b4c3c4d1e65d7fdbb5d91538c92d6863dd Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 26 Oct 2011 23:41:04 +0400 Subject: CIFS: Remove extra mutex_unlock in cifs_lock_add_if to prevent the mutex being unlocked twice if we interrupt a blocked lock. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ea096ce5d4f7..91e05f2f0acf 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -778,7 +778,6 @@ try_again: else { mutex_lock(&cinode->lock_mutex); list_del_init(&lock->blist); - mutex_unlock(&cinode->lock_mutex); } } -- cgit v1.2.3-58-ga151 From 8ea00c6977d8b1463ee86d6689c8ef35ee2529a0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 28 Oct 2011 14:49:46 -0500 Subject: [CIFS] Update cifs version to 1.76 Update cifs version to 1.76 now that async read, lock caching, and changes to oplock enabled interface are in. Thanks to Pavel for reminding me. Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d9dbaf869cd1..30ff56005d8f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.75" +#define CIFS_VERSION "1.76" #endif /* _CIFSFS_H */ -- cgit v1.2.3-58-ga151 From fba90ffee813e2425feb9a57c532b3d297af18c3 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Oct 2011 09:03:00 -0400 Subject: ext4: migrate cleanup This patch cleanup code a bit, actual logic not changed - Move current block pointer to migrate_structure, let's all walk info will be in one structure. - Get rid of usless null ind-block ptr checks, caller already does that check. 
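The distilled shape of the cleanup (fields and types copied from the diff below): all walk state, including the logical-block cursor that callers previously threaded through as a separate by-reference ext4_lblk_t counter, now lives in one structure:

    struct migrate_struct {
            ext4_lblk_t first_block, last_block, curr_block;	/* cursor lives here */
            ext4_fsblk_t first_pblock, last_pblock;
    };

A hole in the walk then simply advances lb->curr_block in place instead of updating a block count passed down by pointer.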
Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 101 +++++++++++++++++++----------------------------------- 1 file changed, 35 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6f07a06f2437..8a9a0912fdae 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -20,13 +20,13 @@ * The contiguous blocks details which can be * represented by a single extent */ -struct list_blocks_struct { - ext4_lblk_t first_block, last_block; +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, - struct list_blocks_struct *lb) + struct migrate_struct *lb) { int retval = 0, needed; @@ -86,8 +86,7 @@ err_out: } static int update_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t blk_num, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* @@ -95,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && - (lb->last_block+1 == blk_num)) { + (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; - lb->last_block = blk_num; + lb->last_block = lb->curr_block; + lb->curr_block++; return 0; } /* @@ -105,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; - lb->first_block = lb->last_block = blk_num; - + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries; - return 0; - } - bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++, blk_count++) { + for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; + } else { + lb->curr_block++; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -171,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ - blk_count += max_entries; + 
lb->curr_block += max_entries; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -211,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; - } else + } else { /* Only update the file block number */ - blk_count += max_entries * max_entries; + lb->curr_block += max_entries * max_entries; + } } - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; @@ -461,10 +434,9 @@ int ext4_ext_migrate(struct inode *inode) handle_t *handle; int retval = 0, i; __le32 *i_data; - ext4_lblk_t blk_count = 0; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; - struct list_blocks_struct lb; + struct migrate_struct lb; unsigned long max_entries; __u32 goal; @@ -550,35 +522,32 @@ int ext4_ext_migrate(struct inode *inode) /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; - for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[i]), - blk_count, &lb); + le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; - } + } else + lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_IND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries; + lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_DIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries * max_entries; + lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_TIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } -- cgit v1.2.3-58-ga151 From 5cb81dabcc28863e7d04e6fd9ede154bd8459c14 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Oct 2011 09:05:00 -0400 Subject: ext4: fix quota accounting during migration The tmp_inode should have same uid/gid as the original inode. Otherwise new metadata blocks will be accounted to wrong quota-id, which will result in a quota leak after the inode migration is completed. 
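The essence of the fix, as the diff below implements it inside ext4_ext_migrate(): pass the original inode's identity down to ext4_new_inode() so the temporary inode's metadata blocks are charged to the same quota id:

    uid_t owner[2];

    owner[0] = inode->i_uid;
    owner[1] = inode->i_gid;
    tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
                               S_IFREG, NULL, goal, owner);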
Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 9 ++++++--- fs/ext4/migrate.c | 7 +++++-- fs/ext4/namei.c | 8 ++++---- 4 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3647ae0b21ab..657e82649fa5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1811,7 +1811,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, - const struct qstr *qstr, __u32 goal); + const struct qstr *qstr, __u32 goal, + uid_t *owner); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e007fecdaedc..acdde93b74d7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -691,7 +691,7 @@ err_ret: * group to find a free inode. */ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr, __u32 goal) + const struct qstr *qstr, __u32 goal, uid_t *owner) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -852,8 +852,11 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - - if (test_opt(sb, GRPID)) { + if (owner) { + inode->i_mode = mode; + inode->i_uid = owner[0]; + inode->i_gid = owner[1]; + } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 8a9a0912fdae..f729377bf043 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -439,6 +439,7 @@ int ext4_ext_migrate(struct inode *inode) struct migrate_struct lb; unsigned long max_entries; __u32 goal; + uid_t owner[2]; /* * If the filesystem does not support extents, or the inode @@ -466,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode) } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = inode->i_uid; + owner[1] = inode->i_gid; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, NULL, goal); + S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = -ENOMEM; + retval = PTR_ERR(inode); ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4a550aa07614..f73d582e89f9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1754,7 +1754,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1790,7 +1790,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1830,7 +1830,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0); + &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2277,7 +2277,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0); + &dentry->d_name, 0, 
NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -- cgit v1.2.3-58-ga151 From 81fdbb4a8d34242f0ed048395c4ddc910f1dffbe Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 29 Oct 2011 09:23:38 -0400 Subject: ext4: move variables to their scope Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 29969622af8b..05693804813d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -117,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - int depth; - if (path) { + int depth = path->p_depth; struct ext4_extent *ex; - depth = path->p_depth; /* * Try to predict block placement assuming that we are @@ -247,7 +245,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); - int idxs, num = 0; + int idxs; idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx)); @@ -262,6 +260,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) */ if (ei->i_da_metadata_calc_len && ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + if ((ei->i_da_metadata_calc_len % idxs) == 0) num++; if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) @@ -324,8 +324,6 @@ static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, int depth) { - struct ext4_extent *ext; - struct ext4_extent_idx *ext_idx; unsigned short entries; if (eh->eh_entries == 0) return 1; @@ -334,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ - ext = EXT_FIRST_EXTENT(eh); + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; @@ -342,7 +340,7 @@ static int ext4_valid_extent_entries(struct inode *inode, entries--; } } else { - ext_idx = EXT_FIRST_INDEX(eh); + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; @@ -3722,13 +3720,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; - unsigned int allocated_clusters = 0, reserved_clusters = 0; + unsigned int allocated_clusters = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext4_lblk_t cluster_offset; - struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -3804,6 +3801,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { + struct ext4_map_blocks punch_map; ext4_fsblk_t partial_cluster = 0; newblock = map->m_lblk - ee_block + ee_start; @@ -4084,8 +4082,9 @@ got_allocated_blocks: * block allocation which had been deferred till now. */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; /* - * Check how many clusters we had reserved this allocted range. 
+ * Check how many clusters we had reserved this allocated range */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); -- cgit v1.2.3-58-ga151 From 02dc62fba89eaee0157752c5f1ba811ef3156e00 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 29 Oct 2011 09:29:11 -0400 Subject: ext4: clean up AGGRESSIVE_TEST code Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 05693804813d..652b4dc5dfcb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -181,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (!check && size > 6) + size = 6; #endif - } return size; } @@ -196,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (!check && size > 5) + size = 5; #endif - } return size; } @@ -212,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (!check && size > 3) + size = 3; #endif - } return size; } @@ -228,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (!check && size > 4) + size = 4; #endif - } return size; } -- cgit v1.2.3-58-ga151 From e7b319e39776bd0e9c0c7855b023dafed2c93d27 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sat, 29 Oct 2011 09:39:51 -0400 Subject: ext4: trace punch_hole correctly in ext4_ext_map_blocks When ext4_ext_map_blocks() is called by punch_hole, trace should trace blocks punched out. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 652b4dc5dfcb..36a0f177ecad 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4171,12 +4171,12 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? punched_out : allocated; + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : result); + return err ? err : result; } -- cgit v1.2.3-58-ga151 From 6d6a435190bdf2e04c9465cde5bdc3ac68cf11a4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 29 Oct 2011 10:15:35 -0400 Subject: ext4: fix race in xattr block allocation path Ceph users reported that when using Ceph on ext4, the filesystem would often become corrupted, containing inodes with incorrect i_blocks counters. I managed to reproduce this with a very hacked-up "streamtest" binary from the Ceph tree. Ceph is doing a lot of xattr writes, to out-of-inode blocks. 
There is also another thread which does sync_file_range and close on the same files. The problem appears to happen due to this race:

  sync/flush thread                    xattr-set thread
  -----------------                    ----------------
  do_writepages                        ext4_xattr_set
    ext4_da_writepages                   ext4_xattr_set_handle
      mpage_da_map_blocks                  ext4_xattr_block_set
      set DELALLOC_RESERVE                   ext4_new_meta_blocks
                                               ext4_mb_new_blocks
                                                 if (!i_delalloc_reserved_flag)
                                                   vfs_dq_alloc_block
        ext4_get_blocks
          down_write(i_data_sem)
          set i_delalloc_reserved_flag
          ...
          up_write(i_data_sem)
                                                 if (i_delalloc_reserved_flag)
                                                   vfs_dq_alloc_block_nofail

In other words, the sync/flush thread pops in and sets i_delalloc_reserved_flag on the inode, which makes the xattr thread think that it's in a delalloc path in ext4_new_meta_blocks(), and add the block for a second time, after already having added it once in the !i_delalloc_reserved_flag case in ext4_mb_new_blocks. The real problem is that we shouldn't be using the DELALLOC_RESERVED state flag, and instead we should be passing EXT4_GET_BLOCKS_DELALLOC_RESERVE down to ext4_map_blocks() instead of using an inode state flag. We'll fix this for now by using i_data_sem to prevent this race, but this is really not the right way to fix things. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/xattr.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 0ae3668520f8..93a00d89a220 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -820,8 +820,14 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; -- cgit v1.2.3-58-ga151 From 5079276066cc421b48a6a63a54a34775979e8506 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 29 Oct 2011 17:17:57 +0400 Subject: CIFS: Fix the VFS brlock cache usage in posix locking case Request the cache in the FL_POSIX case only. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 91e05f2f0acf..c1f063cd1b0c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -793,6 +793,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); unsigned char saved_type = flock->fl_type; + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + mutex_lock(&cinode->lock_mutex); posix_test_lock(file, flock); @@ -809,12 +812,15 @@ static int cifs_posix_lock_set(struct file *file, struct file_lock *flock) { struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); - int rc; + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; mutex_lock(&cinode->lock_mutex); if (!cinode->can_cache_brlcks) { mutex_unlock(&cinode->lock_mutex); - return 1; + return rc; } rc = posix_lock_file_wait(file, flock); mutex_unlock(&cinode->lock_mutex); -- cgit v1.2.3-58-ga151 From 9ef5992e442b2b0bf6364bfcc5574e983a983159 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 20 Oct 2011 13:21:59 -0500 Subject: cifs: Assume passwords are encoded according to iocharset (try #2) Re-posting a patch originally posted by Oskar Liljeblad after rebasing on 3.2.
Modify cifs to assume that the supplied password is encoded according to iocharset. Before this patch passwords would be treated as raw 8-bit data, which made authentication with Unicode passwords impossible (at least passwords with characters > 0xFF). The previous code would as a side effect accept passwords encoded with ISO 8859-1, since Unicode < 0x100 basically is ISO 8859-1. Software which relies on that will no longer support password chars > 0x7F unless it also uses iocharset=iso8859-1. (mount.cifs does not care about the encoding so it will work as expected.) Signed-off-by: Oskar Liljeblad Signed-off-by: Shirish Pargaonkar Reviewed-by: Pavel Shilovsky Tested-by: A Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 8 +++---- fs/cifs/cifsproto.h | 8 ++++--- fs/cifs/connect.c | 2 +- fs/cifs/sess.c | 2 +- fs/cifs/smbencrypt.c | 63 ++++++++++----------------------------------------- 5 files changed, 23 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 2cfb695d1f89..5d9b9acc5fce 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, } /* first calculate 24 bytes ntlm response and then 16 byte session key */ -int setup_ntlm_response(struct cifs_ses *ses) +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; @@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { cFYI(1, "%s Can't generate NTLM response, error: %d", __func__, rc); return rc; } - rc = E_md4hash(ses->password, temp_key); + rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + E_md4hash(ses->password, nt_hash, nls_cp); rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ef4f631e4c01..6f4e243e0f62 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); -extern int setup_ntlm_response(struct cifs_ses *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); @@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); -extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int E_md4hash(const unsigned char *passwd, unsigned char 
*p16, + const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d545a95c30ed..c0458c543f17 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3452,7 +3452,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, else #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr); + bcc_ptr, nls_codepage); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c7d80e24f24e..4ec3ee9d72cc 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses); + rc = setup_ntlm_response(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLM authentication", rc); goto ssetup_exit; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index ac1221d969d6..7cacba12b8f1 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) return rc; } -/* Routines for Windows NT MD4 Hash functions. */ -static int -_my_wcslen(__u16 *str) -{ - int len = 0; - while (*str++ != 0) - len++; - return len; -} - -/* - * Convert a string into an NT UNICODE string. - * Note that regardless of processor type - * this must be in intel (little-endian) - * format. - */ - -static int -_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) -{ /* BB not a very good conversion routine - change/fix */ - int i; - __u16 val; - - for (i = 0; i < len; i++) { - val = *src; - SSVAL(dst, 0, val); - dst++; - src++; - if (val == 0) - break; - } - return i; -} - /* * Creates the MD4 Hash of the users password in NT UNICODE. */ int -E_md4hash(const unsigned char *passwd, unsigned char *p16) +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) { int rc; int len; __u16 wpwd[129]; /* Password cannot be longer than 128 characters */ - if (passwd) { - len = strlen((char *) passwd); - if (len > 128) - len = 128; - - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - } else + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + else { len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } - wpwd[len] = 0; /* Ensure string is null terminated */ - /* Calculate length in bytes */ - len = _my_wcslen(wpwd) * sizeof(__u16); - - rc = mdfour(p16, (unsigned char *) wpwd, len); - memset(wpwd, 0, 129 * 2); + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16)); + memset(wpwd, 0, 129 * sizeof(__u16)); return rc; } /* Does the NT MD4 hash then des encryption. 
*/ int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) { int rc; unsigned char p16[16], p21[21]; @@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p16); + rc = E_md4hash(passwd, p16, codepage); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; -- cgit v1.2.3-58-ga151 From ad4e38dd6a33bb3a4882c487d7abe621e583b982 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 7 Oct 2011 21:51:56 -0600 Subject: writeback: send work item to queue_io, move_expired_inodes Instead of sending ->older_than_this to queue_io() and move_expired_inodes(), send the entire wb_writeback_work structure. There are other fields of a work item that are useful in these routines and in tracepoints. Acked-by: Jan Kara Signed-off-by: Curt Wohlgemuth Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 ++++++++-------- include/trace/events/writeback.h | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6401cd76f109..c51029693600 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -251,7 +251,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + struct wb_writeback_work *work) { LIST_HEAD(tmp); struct list_head *pos, *node; @@ -262,8 +262,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; @@ -302,13 +302,13 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); - trace_writeback_queue_io(wb, older_than_this, moved); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + trace_writeback_queue_io(wb, work, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -651,7 +651,7 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, NULL); + queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); @@ -745,7 +745,7 @@ static long wb_writeback(struct bdi_writeback *wb, trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, work->older_than_this); + queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0ce9f06f58c2..1261db3916cc 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -157,9 +157,9 @@ DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, - unsigned long *older_than_this, + struct wb_writeback_work *work, int 
moved), - TP_ARGS(wb, older_than_this, moved), + TP_ARGS(wb, work, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) @@ -167,6 +167,7 @@ TRACE_EVENT(writeback_queue_io, __field(int, moved) ), TP_fast_assign( + unsigned long *older_than_this = work->older_than_this; strncpy(__entry->name, dev_name(wb->bdi->dev), 32); __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? -- cgit v1.2.3-58-ga151 From 0e175a1835ffc979e55787774e58ec79e41957d7 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 7 Oct 2011 21:54:10 -0600 Subject: writeback: Add a 'reason' to wb_writeback_work This creates a new 'reason' field in a wb_writeback_work structure, which unambiguously identifies who initiates writeback activity. A 'wb_reason' enumeration has been added to writeback.h, to enumerate the possible reasons. The 'writeback_work_class' tracepoint event class and the 'writeback_queue_io' tracepoint are updated to include the symbolic 'reason' in all trace events. And the 'writeback_inodes_sbXXX' family of routines has had a wb_reason parameter added to them, so callers can specify why writeback is being started. Acked-by: Jan Kara Signed-off-by: Curt Wohlgemuth Signed-off-by: Wu Fengguang --- fs/btrfs/extent-tree.c | 3 ++- fs/buffer.c | 2 +- fs/ext4/inode.c | 2 +- fs/fs-writeback.c | 49 +++++++++++++++++++++++++++++----------- fs/quota/quota.c | 2 +- fs/sync.c | 4 ++-- fs/ubifs/budget.c | 2 +- include/linux/backing-dev.h | 3 ++- include/linux/writeback.h | 32 +++++++++++++++++++++----- include/trace/events/writeback.h | 14 ++++++++---- mm/backing-dev.c | 3 ++- mm/page-writeback.c | 3 ++- mm/vmscan.c | 3 ++- 13 files changed, 88 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..c9ee0e18bbdc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3340,7 +3340,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); if (reserved > space_info->bytes_reserved) diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..f5dcee6c4cfb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -285,7 +285,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986e2388f031..7fa73a3b2120 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2241,7 +2241,7 @@ static int ext4_nonda_switch(struct super_block *sb) * start pushing delalloc when 1/2 of free blocks are dirty. */ if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb); + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c51029693600..73c3992b2bb4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -41,11 +41,23 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + enum wb_reason reason; /* why was writeback initiated?
*/ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ }; +const char *wb_reason_name[] = { + [WB_REASON_BACKGROUND] = "background", + [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", + [WB_REASON_SYNC] = "sync", + [WB_REASON_PERIODIC] = "periodic", + [WB_REASON_LAPTOP_TIMER] = "laptop_timer", + [WB_REASON_FREE_MORE_MEM] = "free_more_memory", + [WB_REASON_FS_FREE_SPACE] = "fs_free_space", + [WB_REASON_FORKER_THREAD] = "forker_thread" +}; + /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this @@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->reason = reason; bdi_queue_work(bdi, work); } @@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, reason); } /** @@ -641,12 +655,14 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, + .reason = reason, }; spin_lock(&wb->list_lock); @@ -825,6 +841,7 @@ static long wb_check_background_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); @@ -858,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); @@ -976,7 +994,7 @@ int bdi_writeback_thread(void *data) * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; @@ -989,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1210,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb) * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. 
*/ -void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { @@ -1219,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) .tagged_writepages = 1, .done = &done, .nr_pages = nr, + .reason = reason, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1235,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1248,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb) +int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, reason); up_read(&sb->s_umount); return 1; } else @@ -1269,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); * Returns 1 if writeback was started, 0 if not. */ int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr) + unsigned long nr, + enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr); + writeback_inodes_sb_nr(sb, nr, reason); up_read(&sb->s_umount); return 1; } else @@ -1297,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb) .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, + .reason = WB_REASON_SYNC, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 10b6be3ca280..4bae57fc603b 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); return 0; default: return -EINVAL; diff --git a/fs/sync.c b/fs/sync.c index c98a7477edfd..101b8ef901d7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -98,7 +98,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, WB_REASON_SYNC); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66e52b2..bc4f94b28706 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -63,7 +63,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3b92010d894..b1038bd686ac 100644 --- a/include/linux/backing-dev.h +++ 
b/include/linux/backing-dev.h @@ -118,7 +118,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index ddb4652cb337..a378c295851f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -38,6 +38,23 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_TRY_TO_FREE_PAGES, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; +extern const char *wb_reason_name[]; + /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised @@ -69,14 +86,17 @@ struct writeback_control { */ struct bdi_writeback; int inode_wait(void *); -void writeback_inodes_sb(struct super_block *); -void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); -int writeback_inodes_sb_if_idle(struct super_block *); -int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); +void writeback_inodes_sb(struct super_block *, enum wb_reason reason); +void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); +int writeback_inodes_sb_if_idle(struct super_block *, enum wb_reason reason); +int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); -void wakeup_flusher_threads(long nr_pages); +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); /* writeback.h requires fs.h; it, too, is not included from here. 
*/ static inline void wait_on_inode(struct inode *inode) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1261db3916cc..b99caa8b780c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -34,6 +34,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) + __field(int, reason) ), TP_fast_assign( strncpy(__entry->name, dev_name(bdi->dev), 32); @@ -43,16 +44,18 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; + __entry->reason = work->reason; ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d", + "kupdate=%d range_cyclic=%d background=%d reason=%s", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, - __entry->for_background + __entry->for_background, + wb_reason_name[__entry->reason] ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -165,6 +168,7 @@ TRACE_EVENT(writeback_queue_io, __field(unsigned long, older) __field(long, age) __field(int, moved) + __field(int, reason) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -173,12 +177,14 @@ TRACE_EVENT(writeback_queue_io, __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; + __entry->reason = work->reason; ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ - __entry->moved) + __entry->moved, + wb_reason_name[__entry->reason]) ); TRACE_EVENT(global_dirty_state, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5dcaa3c756d1..dd8916feb05e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -476,7 +476,8 @@ static int bdi_forker_thread(void *ptr) * the bdi from the thread. Hopefully 1024 is * large enough for efficient IO. */ - writeback_inodes_wb(&bdi->wb, 1024); + writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); } else { /* * The spinlock makes sure we do not lose diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45d36f7dc169..650846b61584 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1301,7 +1301,8 @@ void laptop_mode_timer_fn(unsigned long data) * threshold */ if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages); + bdi_start_writeback(&q->backing_dev_info, nr_pages, + WB_REASON_LAPTOP_TIMER); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index b55699cd9067..c735bd770d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2181,7 +2181,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 
0 : total_scanned, + WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } -- cgit v1.2.3-58-ga151 From d73d5046a72467d4510825b99e2269e09ad80e15 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 30 Oct 2011 18:26:08 -0400 Subject: ext4: Use correct locking for ext4_end_io_nolock() We must hold i_completed_io_lock when manipulating anything on the i_completed_io_list linked list. This includes io->list, which we were checking in ext4_end_io_nolock(). So move this check to ext4_end_io_work(). This also has the bonus of avoiding extra work, without needing to take the mutex, if the conversion is already done. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 3 --- fs/ext4/page-io.c | 14 +++++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c942924a0645..851ac5b3cec9 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -83,9 +83,6 @@ int ext4_flush_completed_IO(struct inode *inode) int ret = 0; int ret2 = 0; - if (list_empty(&ei->i_completed_io_list)) - return ret; - dump_completed_IO(inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&ei->i_completed_io_list)){ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 92f38ee13f8a..aed40966f342 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -87,6 +87,9 @@ void ext4_free_io_end(ext4_io_end_t *io) /* * check a range of space and convert unwritten extents to written. + * + * Called with inode->i_mutex; we depend on this when we manipulate + * io->flag, since we could otherwise race with ext4_flush_completed_IO() */ int ext4_end_io_nolock(ext4_io_end_t *io) { @@ -100,9 +103,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (list_empty(&io->list)) - return ret; - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) return ret; @@ -142,6 +142,13 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&io->list)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + goto free; + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + if (!mutex_trylock(&inode->i_mutex)) { /* * Requeue the work instead of waiting so that the work @@ -170,6 +177,7 @@ static void ext4_end_io_work(struct work_struct *work) list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); +free: ext4_free_io_end(io); -- cgit v1.2.3-58-ga151 From 4e298021216727cc27017c5032ade86167c66256 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Oct 2011 18:41:19 -0400 Subject: ext4: remove unnecessary call to waitqueue_active() The usage of waitqueue_active() is not necessary, and introduces (I believe) a hard-to-hit race.
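The pattern being removed and its replacement, in sketch form (wake_up_all() on a waitqueue with no waiters is cheap, so the extra check buys nothing):

	/* racy: a waiter can enqueue itself between the dec-and-test and
	 * the waitqueue_active() check, and then sleep forever */
	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
	    waitqueue_active(wq))
		wake_up_all(wq);

	/* safe: wake unconditionally once the count hits zero */
	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
		wake_up_all(ext4_ioend_wq(io->inode));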
Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index aed40966f342..4fa1d709b604 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page) void ext4_free_io_end(ext4_io_end_t *io) { int i; - wait_queue_head_t *wq; BUG_ON(!io); if (io->page) @@ -78,10 +77,8 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && - waitqueue_active(wq)) - wake_up_all(wq); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io->inode)); kmem_cache_free(io_end_cachep, io); } @@ -96,7 +93,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; - wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," @@ -121,11 +117,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->flag & EXT4_IO_END_UNWRITTEN) { io->flag &= ~EXT4_IO_END_UNWRITTEN; /* Wake up anyone waiting on unwritten extent conversion */ - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); } return ret; -- cgit v1.2.3-58-ga151 From b82e384c7bb9a19036b4daf58fa216df7cd48aa0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Oct 2011 10:56:32 -0400 Subject: ext4: optimize locking for end_io extent conversion Now that we are doing the locking correctly, we need to grab the i_completed_io_lock() twice per end_io. We can clean this up by removing the structure from the i_complted_io_list, and use this as the locking mechanism to prevent ext4_flush_completed_IO() racing against ext4_end_io_work(), instead of clearing the EXT4_IO_END_UNWRITTEN in io->flag. In addition, if the ext4_convert_unwritten_extents() returns an error, we no longer keep the end_io structure on the linked list. This doesn't help, because it tends to lock up the file system and wedges the system. That's one way to call attention to the problem, but it doesn't help the overall robustness of the system. Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 5 ++--- fs/ext4/page-io.c | 37 +++++++++++-------------------------- 2 files changed, 13 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 851ac5b3cec9..00a2cb753efd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -88,6 +88,7 @@ int ext4_flush_completed_IO(struct inode *inode) while (!list_empty(&ei->i_completed_io_list)){ io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); + list_del_init(&io->list); /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -104,11 +105,9 @@ int ext4_flush_completed_IO(struct inode *inode) */ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; - else - list_del_init(&io->list); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? 
ret2 : 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4fa1d709b604..9eebf44646f6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,28 +99,21 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) - return ret; - ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten " - "extents to written extents, error is %d " - "io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); } if (io->iocb) aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - if (io->flag & EXT4_IO_END_UNWRITTEN) { - io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); - } + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } @@ -133,16 +126,15 @@ static void ext4_end_io_work(struct work_struct *work) struct inode *inode = io->inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned long flags; - int ret; spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (list_empty(&io->list)) { spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); goto free; } if (!mutex_trylock(&inode->i_mutex)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work * items queued after this can be processed. @@ -159,16 +151,9 @@ static void ext4_end_io_work(struct work_struct *work) io->flag |= EXT4_IO_END_QUEUED; return; } - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); + list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + (void) ext4_end_io_nolock(io); mutex_unlock(&inode->i_mutex); free: ext4_free_io_end(io); -- cgit v1.2.3-58-ga151 From 92407e75ce45b41c46944891711fd8faf0714d84 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 23 Oct 2011 20:21:17 -0700 Subject: nfs4: serialize layoutcommit Current pnfs_layoutcommit_inode cannot handle parallel layoutcommits. And as Trond suggested, there is no need for the client to optimize for parallel layoutcommits. So add an NFS_INO_LAYOUTCOMMITTING flag to mark an inflight layoutcommit and serialize layoutcommits with it. Also mark_inode_dirty_sync if pnfs_layoutcommit_inode fails to issue a layoutcommit.
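The serialization below uses the standard bit-lock idiom; a sketch of its shape (error handling elided):

	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
		if (!sync)
			return -EAGAIN;		/* a commit is already in flight */
		wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
				 nfs_wait_bit_killable, TASK_KILLABLE);
	}
	/* ... issue the layoutcommit ... */
	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
	smp_mb__after_clear_bit();
	wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);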
Reported-by: Vitaliy Gusev Signed-off-by: Peng Tao Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 ++++++ fs/nfs/pnfs.c | 25 ++++++++++++++++++++++--- include/linux/nfs_fs.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2ae413c986a..b60fddf606f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5950,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ @@ -5959,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata) &lseg->pls_flags)) put_lseg(lseg); } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f700..a2478bc74442 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1443,17 +1443,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { - mark_inode_dirty_sync(inode); status = -ENOMEM; goto out; } + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_free; + + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) { + status = -EAGAIN; + goto out_free; + } + status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (status) + goto out_free; + } + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); spin_unlock(&inode->i_lock); - kfree(data); - goto out; + wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); + goto out_free; } pnfs_list_write_lseg(inode, &data->lseg_list); @@ -1475,6 +1489,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = nfs4_proc_layoutcommit(data, sync); out: + if (status) + mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; +out_free: + kfree(data); + goto out; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 60a137b7f171..ab2c6343361a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -229,6 +229,7 @@ struct nfs_inode { #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ #define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ +#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3-58-ga151 From d743c3c9c236cc61403a4f7d6283d59ddf68b2bd Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 23 Oct 2011 20:22:38 -0700 Subject: NFS4: fix cb_recallany decode error craa_type_mask is bitmap4 per RFC5661. We need to expect a length before extracting bitmap value. 
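On the wire a bitmap4 is a counted array rather than a single word, so the decoder must consume a length first (sketch of the XDR layout):

	/* bitmap4, RFC 5661: */
	uint32_t len;		/* number of 32-bit mask words that follow */
	uint32_t word[/*len*/];	/* craa_type_mask lives in word[0] */

hence the replacement below of the bare read_buf()/ntohl() pair with decode_bitmap().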
Cc: Alexandros Batsakis Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 918ad647afea..ee1a5b3cd48d 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - __be32 *p; + uint32_t bitmap[2]; + __be32 *p, status; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_BADXDR); - args->craa_type_mask = ntohl(*p); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; return 0; } -- cgit v1.2.3-58-ga151 From c02f557dd0a026d7147da3b6f7daf52c6ff5580f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:17:43 -0400 Subject: NFS: Fix documenting comment for nfs_create_request() Clean up: the first parameter of nfs_create_request() has been incorrectly documented since time immemorial (OK, since before 2.6.12). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1f..0a5ff5c19511 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -41,7 +41,7 @@ nfs_page_free(struct nfs_page *p) /** * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use + * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write -- cgit v1.2.3-58-ga151 From c6e696660213a89a5bfde8b49d539553904c808f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:17:53 -0400 Subject: NFS: Clean up nfs4_xdr_dec_secinfo() Clean up: Remove superfluous logic at the tail of nfs4_xdr_dec_secinfo() . Introduced by commit 5a5ea0d4 "NFS: Add secinfo procedure" (March 24, 2011). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1dce12f41a4f..e6161b213ed1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, if (status) goto out; status = decode_secinfo(xdr, res); - if (status) - goto out; out: return status; } -- cgit v1.2.3-58-ga151 From e414966b81a74745ac8d6bfeda0d95fb721e6d91 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:18:03 -0400 Subject: NFS: Remove no-op less-than-zero checks on unsigned variables. Introduced by commit 16b374ca "NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO infrastructure" (October 20, 2010). 
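Both removed tests are dead code because the fields are unsigned, e.g.:

	u32 first_stripe_index;

	if (first_stripe_index < 0)	/* always false for an unsigned type */
		goto out_put;

the compiler simply discards the comparison, so only the upper-bound checks do any work.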
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402f..12185aadb349 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -449,9 +449,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, fl->dsaddr = dsaddr; - if (fl->first_stripe_index < 0 || - fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %d\n", + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); goto out_put; } @@ -552,7 +551,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. * Futher checking is done in filelayout_check_layout */ - if (fl->num_fh < 0 || fl->num_fh > + if (fl->num_fh > max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; -- cgit v1.2.3-58-ga151 From 0edeb71dc9133bfb505d3bf59642e07cd936613e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 31 Oct 2011 17:30:44 -0400 Subject: ext4: Create helper function for EXT4_IO_END_UNWRITTEN and i_aiodio_unwritten Setting the EXT4_IO_END_UNWRITTEN flag and increasing i_aiodio_unwritten should be done simultaneously, since ext4_end_io_nolock always clears the flag and decreases the counter at the same time. We have found some bugs where the flag is set while i_aiodio_unwritten is left unchanged (commit 32c80b32c053d). So this patch creates a helper function to wrap the two operations and avoid any future bug. The idea is inspired by Eric. Cc: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 9 +++++++++ fs/ext4/extents.c | 18 ++++++------------ fs/ext4/inode.c | 5 +---- fs/ext4/page-io.c | 6 ++---- 4 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 657e82649fa5..604c200216c1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1275,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } +} + /* * Inode dynamic state flags */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 36a0f177ecad..a5c8caaaa099 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3472,12 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io) { - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -4030,12 +4027,9 @@ got_allocated_blocks: * that we need to perform conversion when IO is done.
*/ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) { - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e4b26faac5ff..60af5126eb05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2831,10 +2831,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) * but being more careful is always safe for the future change. */ inode = io_end->inode; - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + ext4_set_io_unwritten_flag(inode, io_end); /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9eebf44646f6..7ce1d0b19c94 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -336,10 +336,8 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + if (buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); -- cgit v1.2.3-58-ga151 From e260daf27902b2189a9198f5b64fa4567939bb5b Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 31 Oct 2011 17:54:36 -0400 Subject: ext4: move vars to local scope in ext4_discard_partial_page_buffers_no_lock() Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 60af5126eb05..f60b459b27d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3179,7 +3179,6 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned int offset = from & (PAGE_CACHE_SIZE-1); unsigned int blocksize, max, pos; - unsigned int end_of_block, range_to_discard; ext4_lblk_t iblock; struct buffer_head *bh; int err = 0; @@ -3231,6 +3230,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, pos = offset; while (pos < offset + length) { + unsigned int end_of_block, range_to_discard; + err = 0; /* The length of space left to zero and unmap */ -- cgit v1.2.3-58-ga151 From 5129d05fda57be13f434dbe8536de39a6c25496d Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 31 Oct 2011 17:56:10 -0400 Subject: ext4: return ENOMEM if find_or_create_pages fails Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f60b459b27d7..5fcef98f98e5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3129,7 +3129,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) - return -EINVAL; + return -ENOMEM; err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, from, length, flags); @@ -3372,7 +3372,7 @@ int 
ext4_block_zero_page_range(handle_t *handle, page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) - return -EINVAL; + return -ENOMEM; blocksize = inode->i_sb->s_blocksize; max = blocksize - (offset & (blocksize - 1)); -- cgit v1.2.3-58-ga151 From edb5ac8993e25143f6af1ab143843a65c52e2a15 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 31 Oct 2011 18:04:38 -0400 Subject: ext4: let ext4_discard_partial_buffers handle unaligned range correctly As comment says, we should handle unaligned range rather than aligned one. This fixes a bug found by running xfstests #91. Signed-off-by: Yongqiang Yang --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5fcef98f98e5..de05e86f1993 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3206,8 +3206,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, * to be updated with the contents of the block before * we write the zeros on top of it. */ - if (!(from & (blocksize - 1)) || - !((from + length) & (blocksize - 1))) { + if ((from & (blocksize - 1)) || + ((from + length) & (blocksize - 1))) { create_empty_buffers(page, blocksize, 0); } else { /* -- cgit v1.2.3-58-ga151 From 4af835089984ce9e24c44a51be64c5524788e973 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 31 Oct 2011 18:21:29 -0400 Subject: ext4: remove comments about extent mount option in ext4_new_inode() Remove comments about 'extent' mount option in ext4_new_inode(), since it's no longer exists. Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index acdde93b74d7..612bec255c6c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -873,11 +873,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified - */ + /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; -- cgit v1.2.3-58-ga151 From 3c6fe77017bc6ce489f231c35fed3220b6691836 Mon Sep 17 00:00:00 2001 From: Greg Harm Date: Mon, 31 Oct 2011 18:41:47 -0400 Subject: ext4: Don't normalize an falloc request if it can fit in 1 extent. If an fallocate request fits in EXT_UNINIT_MAX_LEN, then set the EXT4_GET_BLOCKS_NO_NORMALIZE flag. For larger fallocate requests, let mballoc.c normalize the request. This fixes a problem where large requests were being split into non-contiguous extents due to commit 556b27abf73: ext4: do not normalize block requests from fallocate. Testing: *) Checked that 8.x MB falloc'ed files are still laid down next to each other (contiguously). *) Checked that the maximum size extent (127.9MB) is allocated as 1 extent. *) Checked that a 1GB file is somewhat contiguous (often 5-6 non-contiguous extents now). *) Checked that a 120MB file can still be falloc'ed even if there are no single extents large enough to hold it. 
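The heart of the new test, as a user-space sketch (this assumes the usual ext4 definition of EXT_UNINIT_MAX_LEN as 2^15 - 1 filesystem blocks; len is in bytes, so the block limit is shifted left by blkbits before comparing):

#include <stdbool.h>
#include <stdio.h>

#define EXT_UNINIT_MAX_LEN	((1U << 15) - 1)	/* blocks per uninitialized extent */

static bool fits_in_one_extent(unsigned long long len, unsigned int blkbits)
{
	/* convert the per-extent block limit to bytes, then compare */
	return len <= ((unsigned long long)EXT_UNINIT_MAX_LEN << blkbits);
}

int main(void)
{
	/* with 4KiB blocks (blkbits == 12) the limit is ~127.9MB,
	 * matching the maximum-size extent measured above */
	printf("120MB fits: %d\n", fits_in_one_extent(120ULL << 20, 12));
	printf("1GB fits:   %d\n", fits_in_one_extent(1ULL << 30, 12));
	return 0;
}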
Signed-off-by: Greg Harm Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a5c8caaaa099..2798945a1920 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4337,10 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE; + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNINIT_MAX_LEN << blkbits) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; -- cgit v1.2.3-58-ga151 From 78ace70c4186c0d18314eb001637aa97d1585e65 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:05 -0700 Subject: udf: Convert printks to pr_ Use the current logging styles. Convert a few printks that should have been udf_warn and udf_err. Coalesce formats. Add #define pr_fmt. Move an #include "udfdecls.h" above other includes in udftime.c so pr_fmt works correctly. Strip prefixes from conversions as appropriate. Reorder logging definitions in udfdecl.h Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/directory.c | 4 +-- fs/udf/inode.c | 35 +++++++++++--------------- fs/udf/super.c | 73 ++++++++++++++++++++++++------------------------------ fs/udf/truncate.c | 22 ++++++++-------- fs/udf/udfdecl.h | 38 ++++++++++++++-------------- fs/udf/udftime.c | 3 ++- fs/udf/unicode.c | 6 ++--- 7 files changed, 83 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 2ffdb6733af1..4e0843897d07 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs struct short_ad *sa; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } @@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset struct long_ad *la; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f94d6f9febf5..f67e7e268c49 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -84,12 +84,10 @@ void udf_evict_inode(struct inode *inode) end_writeback(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { - printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " - "inode size %llu different from extent length %llu. " - "Filesystem need not be standards compliant.\n", - inode->i_sb->s_id, inode->i_ino, inode->i_mode, - (unsigned long long)inode->i_size, - (unsigned long long)iinfo->i_lenExtents); + udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. 
Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); } kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; @@ -1177,16 +1175,15 @@ static void __udf_read_inode(struct inode *inode) */ bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", - inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); make_bad_inode(inode); return; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) " - "failed ident=%d\n", inode->i_ino, ident); + udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", + inode->i_ino, ident); brelse(bh); make_bad_inode(inode); return; @@ -1226,8 +1223,8 @@ static void __udf_read_inode(struct inode *inode) } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { - printk(KERN_ERR "udf: unsupported strategy type: %d\n", - le16_to_cpu(fe->icbTag.strategyType)); + udf_err(inode->i_sb, "unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); brelse(bh); make_bad_inode(inode); return; @@ -1421,9 +1418,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) udf_debug("METADATA BITMAP FILE-----\n"); break; default: - printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " - "file type=%d\n", inode->i_ino, - fe->icbTag.fileType); + udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", + inode->i_ino, fe->icbTag.fileType); make_bad_inode(inode); return; } @@ -1446,8 +1442,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_ext.i_data) { - printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " - "no free memory\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) no free memory\n", + inode->i_ino); return -ENOMEM; } @@ -1697,9 +1693,8 @@ out: if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { - printk(KERN_WARNING "IO error syncing udf inode " - "[%s:%08lx]\n", inode->i_sb->s_id, - inode->i_ino); + udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + inode->i_ino); err = -EIO; } } diff --git a/fs/udf/super.c b/fs/udf/super.c index 622331f1290d..39e3f351c7a6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -547,8 +547,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->dmode = option & 0777; break; default: - printk(KERN_ERR "udf: bad mount option \"%s\" " - "or missing value\n", p); + pr_err("bad mount option \"%s\" or missing value\n", p); return 0; } } @@ -1106,11 +1105,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { - printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" - " last recorded block (%lu), retrying with the last " - "block of the device (%lu).\n", - (unsigned long)sbi->s_last_block, - (unsigned long)blocks - 1); + pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) @@ -1208,8 +1205,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) if (map->s_partition_type == 
UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i); if (ret) { - printk(KERN_ERR "UDF-fs: error loading MetaData " - "partition map %d\n", i); + udf_err(sb, "error loading MetaData partition map %d\n", + i); goto out_bh; } } else { @@ -1222,9 +1219,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * overwrite blocks instead of relocating them). */ sb->s_flags |= MS_RDONLY; - printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " - "because writing to pseudooverwrite partition is " - "not implemented.\n"); + pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); } out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ @@ -1466,9 +1461,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, bh = udf_read_tagged(sb, block, block, &ident); if (!bh) { - printk(KERN_ERR "udf: Block %Lu of volume descriptor " - "sequence is corrupted or we could not read " - "it.\n", (unsigned long long)block); + udf_err(sb, + "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", + (unsigned long long)block); return 1; } @@ -1541,7 +1536,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, * in a suitable order */ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { - printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + udf_err(sb, "Primary Volume Descriptor not found!\n"); return 1; } if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) @@ -1728,7 +1723,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) - printk(KERN_WARNING "UDF-fs: Bad block size\n"); + udf_warn(sb, "Bad block size\n"); return 0; } sbi->s_last_block = uopt->lastblock; @@ -1737,7 +1732,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, nsr_off = udf_check_vsd(sb); if (!nsr_off) { if (!silent) - printk(KERN_WARNING "UDF-fs: No VRS found\n"); + udf_warn(sb, "No VRS found\n"); return 0; } if (nsr_off == -1) @@ -1753,7 +1748,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, sbi->s_anchor = uopt->anchor; if (!udf_find_anchor(sb, fileset)) { if (!silent) - printk(KERN_WARNING "UDF-fs: No anchor found\n"); + udf_warn(sb, "No anchor found\n"); return 0; } return 1; @@ -1974,15 +1969,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) - printk(KERN_NOTICE - "UDF-fs: Rescanning with blocksize " - "%d\n", UDF_DEFAULT_BLOCKSIZE); + pr_notice("Rescanning with blocksize %d\n", + UDF_DEFAULT_BLOCKSIZE); uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; ret = udf_load_vrs(sb, &uopt, silent, &fileset); } } if (!ret) { - printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); + udf_warn(sb, "No partition found (1)\n"); goto error_out; } @@ -1997,10 +1991,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(lvidiu->maxUDFWriteRev); */ if (minUDFReadRev > UDF_MAX_READ_VERSION) { - printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " - "(max is %x)\n", - le16_to_cpu(lvidiu->minUDFReadRev), - UDF_MAX_READ_VERSION); + udf_err(sb, "minUDFReadRev=%x (max is %x)\n", + le16_to_cpu(lvidiu->minUDFReadRev), + UDF_MAX_READ_VERSION); goto error_out; } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) sb->s_flags |= MS_RDONLY; @@ -2014,28 +2007,27 @@ static int 
udf_fill_super(struct super_block *sb, void *options, int silent) } if (!sbi->s_partitions) { - printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); + udf_warn(sb, "No partition found (2)\n"); goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & UDF_PART_FLAG_READ_ONLY) { - printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " - "forcing readonly mount\n"); + pr_notice("Partition marked readonly; forcing readonly mount\n"); sb->s_flags |= MS_RDONLY; } if (udf_find_fileset(sb, &fileset, &rootdir)) { - printk(KERN_WARNING "UDF-fs: No fileset found\n"); + udf_warn(sb, "No fileset found\n"); goto error_out; } if (!silent) { struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); - udf_info("UDF: Mounting volume '%s', " - "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", - sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, + le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } if (!(sb->s_flags & MS_RDONLY)) @@ -2046,8 +2038,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); if (!inode) { - printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " - "partition=%d\n", + udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); goto error_out; } @@ -2055,7 +2046,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Allocate a dentry for the root inode */ sb->s_root = d_alloc_root(inode); if (!sb->s_root) { - printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); + udf_err(sb, "Couldn't allocate root dentry\n"); iput(inode); goto error_out; } @@ -2095,7 +2086,7 @@ void _udf_err(struct super_block *sb, const char *function, va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_CRIT "UDF-fs error (device %s): %s: %s", + pr_crit("error (device %s): %s: %s", sb->s_id, function, error_buf); } @@ -2107,7 +2098,7 @@ void _udf_warn(struct super_block *sb, const char *function, va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); - printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s", + pr_warn("warning (device %s): %s: %s", sb->s_id, function, error_buf); } @@ -2200,11 +2191,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } else if (ident != TAG_IDENT_SBD) { brelse(bh); - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 8424308db4b4..4b98fee8e161 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode) lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) - printk(KERN_WARNING - "udf_truncate_tail_extent(): Too long " - "extent after EOF in inode %u: i_size: " - "%Ld lbcount: %Ld extent %u+%u\n", - (unsigned)inode->i_ino, - (long long)inode->i_size, - (long long)lbcount, - (unsigned)eloc.logicalBlockNum, - (unsigned)elen); + udf_warn(inode->i_sb, + "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld 
extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) - printk(KERN_ERR "udf_truncate_tail_extent(): " - "Extent after EOF in inode %u.\n", - (unsigned)inode->i_ino); + udf_err(inode->i_sb, + "Extent after EOF in inode %u\n", + (unsigned)inode->i_ino); break; } } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 85e15edc080f..f3d449867301 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -1,6 +1,8 @@ #ifndef __UDF_DECL_H #define __UDF_DECL_H +#define pr_fmt(fmt) "UDF-fs: " fmt + #include "ecma_167.h" #include "osta_udf.h" @@ -16,18 +18,11 @@ #define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 -#undef UDFFS_DEBUG - -#ifdef UDFFS_DEBUG -#define udf_debug(f, a...) \ -do { \ - printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ - __FILE__, __LINE__, __func__); \ - printk(f, ##a); \ -} while (0) -#else -#define udf_debug(f, a...) /**/ -#endif +__attribute__((format(printf, 3, 4))) +extern void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) __attribute__((format(printf, 3, 4))) extern void _udf_warn(struct super_block *sb, const char *function, @@ -35,14 +30,19 @@ extern void _udf_warn(struct super_block *sb, const char *function, #define udf_warn(sb, fmt, ...) \ _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) -__attribute__((format(printf, 3, 4))) -extern void _udf_err(struct super_block *sb, const char *function, - const char *fmt, ...); -#define udf_err(sb, fmt, ...) \ - _udf_err(sb, __func__, fmt, ##__VA_ARGS__) +#define udf_info(fmt, ...) \ + pr_info("INFO " fmt, ##__VA_ARGS__) -#define udf_info(f, a...) \ - printk(KERN_INFO "UDF-fs INFO " f, ##a); +#undef UDFFS_DEBUG + +#ifdef UDFFS_DEBUG +#define udf_debug(fmt, ...) \ + printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) +#else +#define udf_debug(fmt, ...) 
\ + no_printk(fmt, ##__VA_ARGS__) +#endif #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index b8c828c4d200..1f11483eba6a 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -34,9 +34,10 @@ * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm */ +#include "udfdecl.h" + #include #include -#include "udfdecl.h" #define EPOCH_YEAR 1970 diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index d03a90b6ad69..44b815e57f94 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } @@ -242,7 +242,7 @@ try_again: if (utf_cnt) { error_out: ocu[++u_len] = '?'; - printk(KERN_DEBUG "udf: bad UTF-8 character\n"); + printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n")); } ocu[length - 1] = (uint8_t)u_len + 1; @@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } -- cgit v1.2.3-58-ga151 From c2bff36c299b88ca5e05638dfa210d709e2a87ef Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:06 -0700 Subject: udf: Neaten logging output, use vsprintf extension %pV Use %pV and remove a static buffer to save some text space and fix possible issues when several processes call error reporting function in parallel. Also change error level from KERN_CRIT to KERN_ERR. Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/super.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 39e3f351c7a6..926228e7435a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -75,8 +75,6 @@ #define UDF_DEFAULT_BLOCKSIZE 2048 -static char error_buf[1024]; - /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); static void udf_put_super(struct super_block *); @@ -2077,29 +2075,37 @@ error_out: void _udf_err(struct super_block *sb, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; - if (!(sb->s_flags & MS_RDONLY)) { - /* mark sb error */ + /* mark sb error */ + if (!(sb->s_flags & MS_RDONLY)) sb->s_dirt = 1; - } + va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - pr_crit("error (device %s): %s: %s", - sb->s_id, function, error_buf); } void _udf_warn(struct super_block *sb, const char *function, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - pr_warn("warning (device %s): %s: %s", - sb->s_id, function, error_buf); } static void udf_put_super(struct super_block *sb) -- cgit v1.2.3-58-ga151 From a983f368f8986c1ecb64f2947fcf594343130215 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Oct 2011 01:08:07 -0700 Subject: udf: Neaten udf_debug uses Just whitespace and argument alignment. Introduce some checkpatch warnings that deserve to be ignored. Reviewed-by: NamJae Jeon Signed-off-by: Joe Perches Signed-off-by: Jan Kara --- fs/udf/balloc.c | 14 +++++----- fs/udf/directory.c | 4 +-- fs/udf/inode.c | 3 +-- fs/udf/lowlevel.c | 2 +- fs/udf/partition.c | 8 +++--- fs/udf/super.c | 78 ++++++++++++++++++++++++------------------------------ 6 files changed, 50 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 95518a9f589e..987585bb0a1d 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb, int nr_groups = bitmap->s_nr_groups; if (block_group >= nr_groups) { - udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, - nr_groups); + udf_debug("block_group (%d) > nr_groups (%d)\n", + block_group, nr_groups); } if (bitmap->s_block_bitmap[block_group]) { @@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, - count, partmap->s_partition_len); + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, + partmap->s_partition_len); goto error_return; } @@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (udf_set_bit(bit + i, bh->b_data)) { udf_debug("bit %ld already set\n", bit + i); udf_debug("byte=%2x\n", - ((char *)bh->b_data)[(bit + i) >> 3]); + ((char *)bh->b_data)[(bit + i) >> 3]); } } udf_add_free_space(sb, sbi->s_partition, count); @@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, partmap->s_partition_len); goto error_return; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 4e0843897d07..3e44f575fb9c 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) int padlen; if ((!buffer) || (!offset)) { - udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, - offset); + udf_debug("invalidparms, buffer=%p, offset=%p\n", + buffer, offset); return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f67e7e268c49..329e7a108ab7 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1985,8 +1985,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: - udf_debug("alloc_type = %d unsupported\n", - iinfo->i_alloc_type); + udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); return -1; } diff --git a/fs/udf/lowlevel.c 
b/fs/udf/lowlevel.c index 43e24a3b8e10..6583fe9b0645 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb) if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", - (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); + ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ vol_desc_start = ms_info.addr.lba; } else { diff --git a/fs/udf/partition.c b/fs/udf/partition.c index c72edb2260e3..f3e472c67709 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; if (partition >= sbi->s_partitions) { - udf_debug("block=%d, partition=%d, offset=%d: " - "invalid partition\n", block, partition, offset); + udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n", + block, partition, offset); return 0xFFFFFFFF; } map = &sbi->s_partmaps[partition]; @@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, vdata = &map->s_type_specific.s_virtual; if (block > vdata->s_num_entries) { - udf_debug("Trying to access block beyond end of VAT " - "(%d max %d)\n", block, vdata->s_num_entries); + udf_debug("Trying to access block beyond end of VAT (%d max %d)\n", + block, vdata->s_num_entries); return 0xFFFFFFFF; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 926228e7435a..e58123ad75ba 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -639,20 +639,16 @@ static loff_t udf_check_vsd(struct super_block *sb) udf_debug("ISO9660 Boot Record found\n"); break; case 1: - udf_debug("ISO9660 Primary Volume Descriptor " - "found\n"); + udf_debug("ISO9660 Primary Volume Descriptor found\n"); break; case 2: - udf_debug("ISO9660 Supplementary Volume " - "Descriptor found\n"); + udf_debug("ISO9660 Supplementary Volume Descriptor found\n"); break; case 3: - udf_debug("ISO9660 Volume Partition Descriptor " - "found\n"); + udf_debug("ISO9660 Volume Partition Descriptor found\n"); break; case 255: - udf_debug("ISO9660 Volume Descriptor Set " - "Terminator found\n"); + udf_debug("ISO9660 Volume Descriptor Set Terminator found\n"); break; default: udf_debug("ISO9660 VRS (%u) found\n", @@ -803,8 +799,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) pvoldesc->recordingDateAndTime)) { #ifdef UDFFS_DEBUG struct timestamp *ts = &pvoldesc->recordingDateAndTime; - udf_debug("recording time %04u/%02u/%02u" - " %02u:%02u (%x)\n", + udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, ts->minute, le16_to_cpu(ts->typeAndTimezone)); #endif @@ -815,7 +810,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, outstr->u_len > 31 ? 
31 : outstr->u_len); udf_debug("volIdent[] = '%s'\n", - UDF_SB(sb)->s_volume_ident); + UDF_SB(sb)->s_volume_ident); } if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) @@ -847,7 +842,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_metadata_fe = udf_iget(sb, &addr); @@ -867,7 +862,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Mirror metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_mirror_fe = udf_iget(sb, &addr); @@ -896,7 +891,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Bitmap file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_bitmap_fe = udf_iget(sb, &addr); @@ -988,10 +983,9 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; - udf_debug("Partition (%d type %x) starts at physical %d, " - "block length %d\n", p_index, - map->s_partition_type, map->s_partition_root, - map->s_partition_len); + udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n", + p_index, map->s_partition_type, + map->s_partition_root, map->s_partition_len); if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) @@ -1008,12 +1002,12 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_uspace.s_table = udf_iget(sb, &loc); if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", - p_index, map->s_uspace.s_table->i_ino); + p_index, map->s_uspace.s_table->i_ino); } if (phd->unallocSpaceBitmap.extLength) { @@ -1026,8 +1020,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; - udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, - bitmap->s_extPosition); + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } if (phd->partitionIntegrityTable.extLength) @@ -1043,13 +1037,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_fspace.s_table = udf_iget(sb, &loc); if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", - p_index, map->s_fspace.s_table->i_ino); + p_index, map->s_fspace.s_table->i_ino); } if (phd->freedSpaceBitmap.extLength) { @@ -1062,8 +1056,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, - 
bitmap->s_extPosition); + udf_debug("freedSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } return 0; } @@ -1325,9 +1319,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct metadataPartitionMap *mdm = (struct metadataPartitionMap *) &(lvd->partitionMaps[offset]); - udf_debug("Parsing Logical vol part %d " - "type %d id=%s\n", i, type, - UDF_ID_METADATA); + udf_debug("Parsing Logical vol part %d type %d id=%s\n", + i, type, UDF_ID_METADATA); map->s_partition_type = UDF_METADATA_MAP25; map->s_partition_func = udf_get_pblock_meta25; @@ -1346,21 +1339,20 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, mdm->flags & 0x01; udf_debug("Metadata Ident suffix=0x%x\n", - (le16_to_cpu( - ((__le16 *) - mdm->partIdent.identSuffix)[0]))); + le16_to_cpu(*(__le16 *) + mdm->partIdent.identSuffix)); udf_debug("Metadata part num=%d\n", - le16_to_cpu(mdm->partitionNum)); + le16_to_cpu(mdm->partitionNum)); udf_debug("Metadata part alloc unit size=%d\n", - le32_to_cpu(mdm->allocUnitSize)); + le32_to_cpu(mdm->allocUnitSize)); udf_debug("Metadata file loc=%d\n", - le32_to_cpu(mdm->metadataFileLoc)); + le32_to_cpu(mdm->metadataFileLoc)); udf_debug("Mirror file loc=%d\n", - le32_to_cpu(mdm->metadataMirrorFileLoc)); + le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%d\n", - le32_to_cpu(mdm->metadataBitmapFileLoc)); + le32_to_cpu(mdm->metadataBitmapFileLoc)); udf_debug("Duplicate Flag: %d %d\n", - mdata->s_dup_md_flag, mdm->flags); + mdata->s_dup_md_flag, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); @@ -1370,16 +1362,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, map->s_partition_num = le16_to_cpu(upm2->partitionNum); } udf_debug("Partition (%d:%d) type %d on volume %d\n", - i, map->s_partition_num, type, - map->s_volumeseqnum); + i, map->s_partition_num, type, map->s_volumeseqnum); } if (fileset) { struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); - udf_debug("FileSet found in LogicalVolDesc at block=%d, " - "partition=%d\n", fileset->logicalBlockNum, + udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n", + fileset->logicalBlockNum, fileset->partitionReferenceNum); } if (lvd->integritySeqExt.extLength) @@ -1734,8 +1725,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, return 0; } if (nsr_off == -1) - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); + udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n"); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { -- cgit v1.2.3-58-ga151 From 0aaa618863c40e86b543debe002d6f65ff5d61d4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 10 Oct 2011 18:32:06 +0200 Subject: quota: Drop path reference on error exit from quotactl One error exit from quotactl forgot to do path_put(). Fix that. 
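The shape of the fix, as a minimal user-space model (path_put() and the error value here are stand-ins for the kernel interfaces, not the real ones):

#include <errno.h>
#include <stdio.h>

struct path { int pinned; };

static void path_put(struct path *p)	/* stand-in for the kernel helper */
{
	p->pinned = 0;
}

static int quotactl_model(int lookup_fails)
{
	struct path path = { .pinned = 1 };	/* reference taken earlier */
	int ret;

	if (lookup_fails) {
		ret = -ENODEV;
		goto out;	/* the old code returned here, leaking the path */
	}
	ret = 0;		/* do_quotactl() + drop_super() would run here */
out:
	path_put(&path);	/* single exit point drops the reference */
	return ret;
}

int main(void)
{
	printf("error path returns %d\n", quotactl_model(1));
	return 0;
}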
Reported-by: Valerie Aurora Signed-off-by: Jan Kara --- fs/quota/quota.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b34bdb25490c..a18ef461a5ba 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -363,12 +363,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, } sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); +out: if (pathp && !IS_ERR(pathp)) path_put(pathp); return ret; -- cgit v1.2.3-58-ga151 From 6360e21f943172bb71772ec150b96a9e787a535f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 19 Oct 2011 14:16:23 +0400 Subject: ext3: Allow quota file use root reservation Quota file is fs's metadata, so it is reasonable to permit use of the root reservation if necessary. This patch fixes the 265th xfstest failure. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a7fceaba87f1..a2038928f9a3 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1440,14 +1440,14 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && + !use_reservation && sbi->s_resuid != current_fsuid() && (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { return 0; } @@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } -- cgit v1.2.3-58-ga151 From 3080a74ea39eece6ac21aae768c48ab8b1f89ac1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 23 Oct 2011 19:28:32 +0900 Subject: udf: Skip mirror metadata FE loading when metadata FE is ok It is not necessary to load mirror metadata FE when metadata FE is OK. So try to read it only the first time udf_get_pblock_meta25() fails to map the block from metadata FE.
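The load-on-first-failure pattern, sketched in isolation (simplified stand-ins; the real code keys off mdata->s_mirror_loaded_flag, as the diff below shows):

#include <stdio.h>

struct meta_data {
	int primary_ok;		/* stands in for s_metadata_fe being usable */
	int mirror_loaded;	/* stands in for s_mirror_loaded_flag */
};

static void load_mirror(struct meta_data *m)
{
	printf("loading mirror FE\n");	/* expensive read, done at most once */
	m->mirror_loaded = 1;
}

static int map_block(struct meta_data *m, int block)
{
	if (m->primary_ok)
		return block;		/* common case: primary FE maps it */
	if (!m->mirror_loaded)		/* first failure only */
		load_mirror(m);
	return block;			/* retry the lookup via the mirror */
}

int main(void)
{
	struct meta_data m = { .primary_ok = 0, .mirror_loaded = 0 };

	map_block(&m, 10);	/* triggers the one-time mirror load */
	map_block(&m, 11);	/* mirror already cached, no reload */
	return 0;
}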
Signed-off-by: Ashish Sangwan Signed-off-by: Namjae Jeon Signed-off-by: Jan Kara --- fs/udf/partition.c | 8 ++++++- fs/udf/super.c | 68 ++++++++++++++++++++++++++---------------------------- fs/udf/udf_sb.h | 1 + fs/udf/udfdecl.h | 2 ++ 4 files changed, 43 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/udf/partition.c b/fs/udf/partition.c index f3e472c67709..b526f25c04c9 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -321,8 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, /* We shouldn't mount such media... */ BUG_ON(!inode); retblk = udf_try_read_meta(inode, block, partition, offset); - if (retblk == 0xFFFFFFFF) { + if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); + if (!mdata->s_mirror_loaded_flag) { + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_mirror_loaded_flag = 1; + } + inode = mdata->s_mirror_fe; if (!inode) return 0xFFFFFFFF; diff --git a/fs/udf/super.c b/fs/udf/super.c index e58123ad75ba..dfe043a36593 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -826,59 +826,57 @@ out1: return ret; } +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num) +{ + struct kernel_lb_addr addr; + struct inode *metadata_fe; + + addr.logicalBlockNum = meta_file_loc; + addr.partitionReferenceNum = partition_num; + + metadata_fe = udf_iget(sb, &addr); + + if (metadata_fe == NULL) + udf_warn(sb, "metadata inode efe not found\n"); + else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); + iput(metadata_fe); + metadata_fe = NULL; + } + + return metadata_fe; +} + static int udf_load_metadata_files(struct super_block *sb, int partition) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; - int fe_error = 0; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; /* metadata address */ - addr.logicalBlockNum = mdata->s_meta_file_loc; - addr.partitionReferenceNum = map->s_partition_num; - udf_debug("Metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_iget(sb, &addr); + mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, + mdata->s_meta_file_loc, map->s_partition_num); if (mdata->s_metadata_fe == NULL) { - udf_warn(sb, "metadata inode efe not found, will try mirror inode\n"); - fe_error = 1; - } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); - fe_error = 1; - iput(mdata->s_metadata_fe); - mdata->s_metadata_fe = NULL; - } - - /* mirror file entry */ - addr.logicalBlockNum = mdata->s_mirror_file_loc; - addr.partitionReferenceNum = map->s_partition_num; + /* mirror file entry */ + udf_debug("Mirror metadata file location: block = %d part = %d\n", + mdata->s_mirror_file_loc, map->s_partition_num); - udf_debug("Mirror metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_iget(sb, &addr); - - if (mdata->s_mirror_fe == NULL) { - if (fe_error) { - 
udf_err(sb, "mirror inode efe not found and metadata inode is missing too, exiting...\n"); - goto error_exit; - } else - udf_warn(sb, "mirror inode efe not found, but metadata inode is OK\n"); - } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warn(sb, "mirror inode efe does not have short allocation descriptors!\n"); - iput(mdata->s_mirror_fe); - mdata->s_mirror_fe = NULL; - if (fe_error) + if (mdata->s_mirror_fe == NULL) { + udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); goto error_exit; + } } /* diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4858c191242b..a3146b05feeb 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -61,6 +61,7 @@ struct udf_meta_data { __u32 s_alloc_unit_size; __u16 s_align_unit_size; __u8 s_dup_md_flag; + __u8 s_mirror_loaded_flag; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f3d449867301..79aae3fe7b55 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -135,6 +135,8 @@ static inline void udf_updated_lvid(struct super_block *sb) UDF_SB(sb)->s_lvid_dirty = 1; } extern u64 lvid_get_unique_id(struct super_block *sb); +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num); /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, -- cgit v1.2.3-58-ga151 From ed47a7d00c22b326fc4c97342a73ecd15929732e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Oct 2011 16:47:48 +0200 Subject: udf: Cleanup metadata flags handling Use simple ->s_flags variable instead of u8 variable for each flag. Signed-off-by: Jan Kara --- fs/udf/partition.c | 4 ++-- fs/udf/super.c | 8 ++++---- fs/udf/udf_sb.h | 6 ++++-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/udf/partition.c b/fs/udf/partition.c index b526f25c04c9..d6caf01a2097 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -323,10 +323,10 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, retblk = udf_try_read_meta(inode, block, partition, offset); if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); - if (!mdata->s_mirror_loaded_flag) { + if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) { mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_loaded_flag = 1; + mdata->s_flags |= MF_MIRROR_FE_LOADED; } inode = mdata->s_mirror_fe; diff --git a/fs/udf/super.c b/fs/udf/super.c index dfe043a36593..e185253470df 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1333,8 +1333,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, le32_to_cpu(mdm->allocUnitSize); mdata->s_align_unit_size = le16_to_cpu(mdm->alignUnitSize); - mdata->s_dup_md_flag = - mdm->flags & 0x01; + if (mdm->flags & 0x01) + mdata->s_flags |= MF_DUPLICATE_MD; udf_debug("Metadata Ident suffix=0x%x\n", le16_to_cpu(*(__le16 *) @@ -1349,8 +1349,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%d\n", le32_to_cpu(mdm->metadataBitmapFileLoc)); - udf_debug("Duplicate Flag: %d %d\n", - mdata->s_dup_md_flag, mdm->flags); + udf_debug("Flags: %d %d\n", + mdata->s_flags, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 
a3146b05feeb..5142a82e3276 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -54,14 +54,16 @@ #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ +#define MF_DUPLICATE_MD 0x01 +#define MF_MIRROR_FE_LOADED 0x02 + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; __u32 s_bitmap_file_loc; __u32 s_alloc_unit_size; __u16 s_align_unit_size; - __u8 s_dup_md_flag; - __u8 s_mirror_loaded_flag; + int s_flags; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; -- cgit v1.2.3-58-ga151 From ff3fc1736f1967b59801ab2cf6409fc6c8556b0a Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 31 Oct 2011 18:55:50 -0400 Subject: ext4: fix a typo in struct ext4_allocation_context This patch changes "bext" to "best". Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dc99930d4cb5..47705f3285e3 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -175,7 +175,7 @@ struct ext4_allocation_context { /* the best found extent */ struct ext4_free_extent ac_b_ex; - /* copy of the bext found extent taken before preallocation efforts */ + /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; /* number of iterations done. we have to track to limit searching */ -- cgit v1.2.3-58-ga151 From afeacc8c1f38b7bb93d4bc7b4ba04c2605061ef0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 16:00:52 -0400 Subject: fs: add export.h to files using EXPORT_SYMBOL/THIS_MODULE macros These files were getting via an implicit include path, but we want to crush those out of existence since they cost time during compiles of processing thousands of lines of headers for no reason. Give them the lightweight header that just contains the EXPORT_SYMBOL infrastructure. 
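After the split, a built-in file that only exports symbols needs nothing heavier than this (illustrative fragment; my_helper is a hypothetical function, not one touched by this series):

#include <linux/export.h>

int my_helper(void)
{
	return 0;
}
EXPORT_SYMBOL(my_helper);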
Signed-off-by: Paul Gortmaker --- fs/bio-integrity.c | 1 + fs/gfs2/ops_fstype.c | 1 + fs/ioprio.c | 1 + fs/jfs/jfs_logmgr.c | 1 + fs/nfs/pagelist.c | 1 + fs/nfs/pnfs_dev.c | 1 + fs/nfs/write.c | 1 + fs/nfsd/nfs4acl.c | 1 + fs/ocfs2/cluster/tcp.c | 1 + fs/ocfs2/dlm/dlmdebug.c | 1 + fs/proc/vmcore.c | 1 + 11 files changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9c5e6b2cd11a..c2183f3917cd 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7e823bbd2453..cb23c2be731a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a06508e5..f79dab83e17b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -21,6 +21,7 @@ */ #include #include +#include #include #include #include diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 583636f745e5..cc5f811ed383 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -67,6 +67,7 @@ #include /* for sync_blockdev() */ #include #include +#include #include #include #include diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1f..a788d8522c88 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" #include "pnfs.h" diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6fda5228ef56..4f359d2a26eb 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -28,6 +28,7 @@ * such damages. */ +#include #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b2..cd1edfd8c2d0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -20,6 +20,7 @@ #include #include #include +#include #include diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index ad88f1c0a4c3..9c51aff02ae2 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,6 +36,7 @@ #include #include +#include #include "acl.h" diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index db5ee4b4f47a..ad7d0c155de4 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e3..0e28e242226d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cd99bf557650..b0f450a2bb7c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-58-ga151 From 143cb494cb6662e37c4020b7fe9839837f718e56 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 1 Jul 2011 14:23:34 -0400 Subject: fs: add module.h to files that were implicitly using it Some files were using the complete module.h infrastructure without actually including the header at all. 
Fix them up in advance so once the implicit presence is removed, we won't get failures like this: CC [M] fs/nfsd/nfssvc.o fs/nfsd/nfssvc.c: In function 'nfsd_create_serv': fs/nfsd/nfssvc.c:335: error: 'THIS_MODULE' undeclared (first use in this function) fs/nfsd/nfssvc.c:335: error: (Each undeclared identifier is reported only once fs/nfsd/nfssvc.c:335: error: for each function it appears in.) fs/nfsd/nfssvc.c: In function 'nfsd': fs/nfsd/nfssvc.c:555: error: implicit declaration of function 'module_put_and_exit' make[3]: *** [fs/nfsd/nfssvc.o] Error 1 Signed-off-by: Paul Gortmaker --- fs/cifs/connect.c | 1 + fs/exofs/ore.c | 1 + fs/exofs/super.c | 1 + fs/fuse/cuse.c | 1 + fs/logfs/super.c | 1 + fs/nfs/nfs4filelayout.c | 1 + fs/nfs/pnfs.c | 1 + fs/nfsd/nfsctl.c | 1 + fs/nfsd/nfssvc.c | 1 + 9 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d545a95c30ed..7ef4e2846658 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "cifspdu.h" #include "cifsglob.h" diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fcfa86ae6faf..d271ad837202 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -23,6 +23,7 @@ */ #include +#include #include #include diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 057b237b8b69..e6085ec192d6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cca47f7b07..3426521f3205 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "fuse_i.h" diff --git a/fs/logfs/super.c b/fs/logfs/super.c index ce03a182c771..b9b3154b0485 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402f..955699515e70 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -31,6 +31,7 @@ #include #include +#include #include "internal.h" #include "nfs4filelayout.h" diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f700..ba1d5388fafd 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -29,6 +29,7 @@ #include #include +#include #include "internal.h" #include "pnfs.h" #include "iostat.h" diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index db34a585e112..c45a2ea4a090 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "idmap.h" #include "nfsd.h" diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b1..4b8e828ae15f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -8,6 +8,7 @@ #include #include +#include #include #include -- cgit v1.2.3-58-ga151 From fc360bd9cdcf875639a77f07fafec26699c546f3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 31 Oct 2011 17:06:32 -0700 Subject: /proc/self/numa_maps: restore "huge" tag for hugetlb vmas The display of the "huge" tag was accidentally removed in 29ea2f698 ("mm: use walk_page_range() instead of custom page table walking code"). 
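A quick user-space check for the restored tag (a sketch: it assumes the process has a hugetlb mapping, e.g. from mmap() with MAP_HUGETLB, and simply greps its own numa_maps for the string the kernel emits):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/numa_maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, " huge"))	/* tag printed by show_numa_map() */
			fputs(line, stdout);
	fclose(f);
	return 0;
}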
Reported-by: Stephen Hemminger Tested-by: Stephen Hemminger Reviewed-by: Stephen Wilson Cc: KOSAKI Motohiro Cc: Hugh Dickins Acked-by: David Rientjes Cc: Lee Schermerhorn Cc: Alexey Dobriyan Cc: Christoph Lameter Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5afaa58a8630..c7d4ee663f14 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1039,6 +1039,9 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " stack"); } + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + walk_page_range(vma->vm_start, vma->vm_end, &walk); if (!md->pages) -- cgit v1.2.3-58-ga151 From fcf634098c00dd9cd247447368495f0b79be12d1 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Mon, 31 Oct 2011 17:06:39 -0700 Subject: Cross Memory Attach The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - It does not allow for specifying iovecs for both src and dest; assuming preadv or pwritev were implemented, either the area read from or written to would need to be contiguous. - Currently mem_read allows only processes that are currently ptrace'ing the target, and are still able to ptrace the target, to read from the target. This check could possibly be moved to the open call, but it's not clear exactly what race this restriction is stopping (the reason appears to have been lost). - Having to send the fd of /proc/self/mem via SCM_RIGHTS on a unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other. - It doesn't allow for some future uses of the interface we would like to consider adding (see below). - Interestingly, reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily.) As mentioned previously, use of vmsplice instead was considered, but it has problems. Since you need the reader and writer working co-operatively, if the pipe is not drained then you block, which requires some wrapping to do non-blocking I/O on the send side or polling on the receive side. In all-to-all communication it requires ordering, otherwise you can deadlock. And in the example of many MPI tasks writing to one MPI task, vmsplice serialises the copying. There are some cases of MPI collectives where even a single-copy interface does not get us all the performance gain we could have. For example, in an MPI_Reduce, rather than copy the data from the source we would like to instead use it directly in a math op (say the reduce is doing a sum), as this would save us doing a copy. We don't need to keep a copy of the data from the source.
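As a concrete illustration of the basic single-copy read path, here is a hedged userspace sketch. It assumes a libc wrapper with the same signature as the syscall added by this patch (the helper name read_remote is made up), and error handling is deliberately minimal:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>    /* assumed home of the process_vm_readv() wrapper */

/* Copy 'len' bytes from address 'raddr' in process 'pid' into the local
 * buffer 'buf' with a single copy, without ptrace or shared memory. */
static ssize_t read_remote(pid_t pid, void *raddr, void *buf, size_t len)
{
        struct iovec local  = { .iov_base = buf,   .iov_len = len };
        struct iovec remote = { .iov_base = raddr, .iov_len = len };

        /* flags must currently be 0; a short count is possible if only
         * part of the remote range was readable. */
        return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}

Beyond plain copying, the currently unused flags argument leaves room for richer semantics, such as the reduce-style operation described above.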
I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - eg could specify the math operation and type and the kernel rather than just copying the data would apply the specified operation between the source and destination and store it in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra process messaging which is not MPI). This interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so in addition to this being useful for OpenMPI it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc, other architecture should mainly (I think) just need to add syscall numbers for the process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Howells Cc: James Morris Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/systbl.h | 2 + arch/powerpc/include/asm/unistd.h | 4 +- arch/x86/ia32/ia32entry.S | 2 + arch/x86/include/asm/unistd_32.h | 4 +- arch/x86/include/asm/unistd_64.h | 4 + arch/x86/kernel/syscall_table_32.S | 2 + fs/aio.c | 4 +- fs/compat.c | 7 +- fs/read_write.c | 8 +- include/linux/compat.h | 3 +- include/linux/fs.h | 7 +- include/linux/syscalls.h | 13 + kernel/sys_ni.c | 4 + mm/Makefile | 3 +- mm/process_vm_access.c | 496 +++++++++++++++++++++++++++++++++++++ security/keys/compat.c | 2 +- security/keys/keyctl.c | 2 +- 17 files changed, 550 insertions(+), 17 deletions(-) create mode 100644 mm/process_vm_access.c (limited to 'fs') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index fa0d27a400de..559ae1ee6706 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -354,3 +354,5 @@ COMPAT_SYS_SPU(clock_adjtime) SYSCALL_SPU(syncfs) COMPAT_SYS_SPU(sendmmsg) SYSCALL_SPU(setns) +COMPAT_SYS(process_vm_readv) +COMPAT_SYS(process_vm_writev) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b8b3f599362b..d3d1b5efd7eb 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -373,10 +373,12 @@ #define __NR_syncfs 348 #define __NR_sendmmsg 349 #define __NR_setns 350 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 #ifdef __KERNEL__ -#define __NR_syscalls 351 +#define __NR_syscalls 353 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 54edb207ff3a..a6253ec1b284 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -850,4 +850,6 @@ ia32_sys_call_table: .quad sys_syncfs .quad compat_sys_sendmmsg /* 345 */ .quad sys_setns + 
.quad compat_sys_process_vm_readv + .quad compat_sys_process_vm_writev ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 593485b38ab3..599c77d38f33 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -352,10 +352,12 @@ #define __NR_syncfs 344 #define __NR_sendmmsg 345 #define __NR_setns 346 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 #ifdef __KERNEL__ -#define NR_syscalls 347 +#define NR_syscalls 349 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 0a6ba337a2eb..0431f193c3f2 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg) __SYSCALL(__NR_setns, sys_setns) #define __NR_getcpu 309 __SYSCALL(__NR_getcpu, sys_getcpu) +#define __NR_process_vm_readv 310 +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) +#define __NR_process_vm_writev 311 +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index bc19be332bc9..9a0e31293920 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -346,3 +346,5 @@ ENTRY(sys_call_table) .long sys_syncfs .long sys_sendmmsg /* 345 */ .long sys_setns + .long sys_process_vm_readv + .long sys_process_vm_writev diff --git a/fs/aio.c b/fs/aio.c index e29ec485af25..632b235f4fbe 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1387,13 +1387,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); if (ret < 0) goto out; diff --git a/fs/compat.c b/fs/compat.c index 302e761bd0aa..c98787536bb8 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -546,7 +546,7 @@ out: ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, int check_access) { compat_ssize_t tot_len; struct iovec *iov = *ret_pointer = fast_pointer; @@ -593,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ goto out; - if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + if (check_access && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } @@ -1107,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, goto out; tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); + UIO_FASTIOV, iovstack, &iov, 1); if (tot_len == 0) { ret = 0; goto out; diff --git a/fs/read_write.c b/fs/read_write.c index dfd125798791..5ad4248b0cd8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -633,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, + int check_access) { unsigned long seg; ssize_t ret; @@ -689,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, ret = -EINVAL; goto out; } - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + if (check_access + && unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; goto out; } @@ -721,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_copy_check_uvector(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret <= 0) goto out; diff --git a/include/linux/compat.h b/include/linux/compat.h index c6e7523bf765..154bf5683015 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -547,7 +547,8 @@ extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer); + struct iovec **ret_pointer, + int check_access); extern void __user *compat_alloc_user_space(unsigned long len); diff --git a/include/linux/fs.h b/include/linux/fs.h index 14493a2d5a03..87b4c6b9692d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1633,9 +1633,10 @@ struct inode_operations { struct seq_file; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, - unsigned long nr_segs, unsigned long fast_segs, - struct iovec *fast_pointer, - struct iovec **ret_pointer); + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer, + int check_access); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1ff0ec2a5e8d..86a24b1166d1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -844,4 +844,17 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long sys_setns(int fd, int nstype); +asmlinkage long sys_process_vm_readv(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags); +asmlinkage long sys_process_vm_writev(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags); + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a9a5de07c4f1..47bfa16430d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -145,6 +145,10 @@ cond_syscall(sys_io_submit); 
cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); cond_syscall(sys_syslog); +cond_syscall(sys_process_vm_readv); +cond_syscall(sys_process_vm_writev); +cond_syscall(compat_sys_process_vm_readv); +cond_syscall(compat_sys_process_vm_writev); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/Makefile b/mm/Makefile index 836e4163c1bf..50ec00ef2a0e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,8 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + vmalloc.o pagewalk.o pgtable-generic.o \ + process_vm_access.o obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 000000000000..e920aa3ce104 --- /dev/null +++ b/mm/process_vm_access.c @@ -0,0 +1,496 @@ +/* + * linux/mm/process_vm_access.c + * + * Copyright (C) 2010-2011 Christopher Yeoh , IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_COMPAT +#include +#endif + +/** + * process_vm_rw_pages - read/write pages from task specified + * @task: task to read/write from + * @mm: mm for task + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @pa: address of page in task to start copying from/to + * @start_offset: offset in page to start copying from/to + * @len: number of bytes to copy + * @lvec: iovec array specifying where to copy to/from + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @vm_write: 0 means copy from, 1 means copy to + * @nr_pages_to_copy: number of pages to copy + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success, error code otherwise + */ +static int process_vm_rw_pages(struct task_struct *task, + struct mm_struct *mm, + struct page **process_pages, + unsigned long pa, + unsigned long start_offset, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + int vm_write, + unsigned int nr_pages_to_copy, + ssize_t *bytes_copied) +{ + int pages_pinned; + void *target_kaddr; + int pgs_copied = 0; + int j; + int ret; + ssize_t bytes_to_copy; + ssize_t rc = 0; + + *bytes_copied = 0; + + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages_pinned = get_user_pages(task, mm, pa, + nr_pages_to_copy, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); + + if (pages_pinned != nr_pages_to_copy) { + rc = -EFAULT; + goto end; + } + + /* Do the copy for each page */ + for (pgs_copied = 0; + (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); + pgs_copied++) { + /* Make sure we have a non zero length iovec */ + while (*lvec_current < lvec_cnt + && lvec[*lvec_current].iov_len == 0) + (*lvec_current)++; + if (*lvec_current == lvec_cnt) + break; + + /* + * Will copy smallest of: + * - bytes remaining in page + * - bytes remaining in destination iovec + */ + bytes_to_copy = min_t(ssize_t, PAGE_SIZE 
- start_offset, + len - *bytes_copied); + bytes_to_copy = min_t(ssize_t, bytes_to_copy, + lvec[*lvec_current].iov_len + - *lvec_offset); + + target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; + + if (vm_write) + ret = copy_from_user(target_kaddr, + lvec[*lvec_current].iov_base + + *lvec_offset, + bytes_to_copy); + else + ret = copy_to_user(lvec[*lvec_current].iov_base + + *lvec_offset, + target_kaddr, bytes_to_copy); + kunmap(process_pages[pgs_copied]); + if (ret) { + *bytes_copied += bytes_to_copy - ret; + pgs_copied++; + rc = -EFAULT; + goto end; + } + *bytes_copied += bytes_to_copy; + *lvec_offset += bytes_to_copy; + if (*lvec_offset == lvec[*lvec_current].iov_len) { + /* + * Need to copy remaining part of page into the + * next iovec if there are any bytes left in page + */ + (*lvec_current)++; + *lvec_offset = 0; + start_offset = (start_offset + bytes_to_copy) + % PAGE_SIZE; + if (start_offset) + pgs_copied--; + } else { + start_offset = 0; + } + } + +end: + if (vm_write) { + for (j = 0; j < pages_pinned; j++) { + if (j < pgs_copied) + set_page_dirty_lock(process_pages[j]); + put_page(process_pages[j]); + } + } else { + for (j = 0; j < pages_pinned; j++) + put_page(process_pages[j]); + } + + return rc; +} + +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ +#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) + +/** + * process_vm_rw_single_vec - read/write pages from task specified + * @addr: start memory address of target process + * @len: size of area to copy to/from + * @lvec: iovec array specifying where to copy to/from locally + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @mm: mm for task + * @task: task to read/write from + * @vm_write: 0 means copy from, 1 means copy to + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success or on failure error code + */ +static int process_vm_rw_single_vec(unsigned long addr, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + struct page **process_pages, + struct mm_struct *mm, + struct task_struct *task, + int vm_write, + ssize_t *bytes_copied) +{ + unsigned long pa = addr & PAGE_MASK; + unsigned long start_offset = addr - pa; + unsigned long nr_pages; + ssize_t bytes_copied_loop; + ssize_t rc = 0; + unsigned long nr_pages_copied = 0; + unsigned long nr_pages_to_copy; + unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES + / sizeof(struct pages *); + + *bytes_copied = 0; + + /* Work out address and page range required */ + if (len == 0) + return 0; + nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + + while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { + nr_pages_to_copy = min(nr_pages - nr_pages_copied, + max_pages_per_loop); + + rc = process_vm_rw_pages(task, mm, process_pages, pa, + start_offset, len, + lvec, lvec_cnt, + lvec_current, lvec_offset, + vm_write, nr_pages_to_copy, + &bytes_copied_loop); + start_offset = 0; + *bytes_copied += bytes_copied_loop; + + if (rc < 0) { + return rc; + } else { + len -= bytes_copied_loop; + nr_pages_copied += nr_pages_to_copy; + pa += nr_pages_to_copy * PAGE_SIZE; + } + } + + return rc; +} + +/* Maximum number of entries for process pages array + which lives on stack */ +#define 
PVM_MAX_PP_ARRAY_COUNT 16 + +/** + * process_vm_rw_core - core of reading/writing pages from task specified + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct task_struct *task; + struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; + struct page **process_pages = pp_stack; + struct mm_struct *mm; + unsigned long i; + ssize_t rc = 0; + ssize_t bytes_copied_loop; + ssize_t bytes_copied = 0; + unsigned long nr_pages = 0; + unsigned long nr_pages_iov; + unsigned long iov_l_curr_idx = 0; + size_t iov_l_curr_offset = 0; + ssize_t iov_len; + + /* + * Work out how many pages of struct pages we're going to need + * when eventually calling get_user_pages + */ + for (i = 0; i < riovcnt; i++) { + iov_len = rvec[i].iov_len; + if (iov_len > 0) { + nr_pages_iov = ((unsigned long)rvec[i].iov_base + + iov_len) + / PAGE_SIZE - (unsigned long)rvec[i].iov_base + / PAGE_SIZE + 1; + nr_pages = max(nr_pages, nr_pages_iov); + } + } + + if (nr_pages == 0) + return 0; + + if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { + /* For reliability don't try to kmalloc more than + 2 pages worth */ + process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, + sizeof(struct pages *)*nr_pages), + GFP_KERNEL); + + if (!process_pages) + return -ENOMEM; + } + + /* Get process information */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) { + rc = -ESRCH; + goto free_proc_pages; + } + + task_lock(task); + if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + task_unlock(task); + rc = -EPERM; + goto put_task_struct; + } + mm = task->mm; + + if (!mm || (task->flags & PF_KTHREAD)) { + task_unlock(task); + rc = -EINVAL; + goto put_task_struct; + } + + atomic_inc(&mm->mm_users); + task_unlock(task); + + for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + rc = process_vm_rw_single_vec( + (unsigned long)rvec[i].iov_base, rvec[i].iov_len, + lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, + process_pages, mm, task, vm_write, &bytes_copied_loop); + bytes_copied += bytes_copied_loop; + if (rc != 0) { + /* If we have managed to copy any data at all then + we return the number of bytes copied. 
Otherwise + we return the error code */ + if (bytes_copied) + rc = bytes_copied; + goto put_mm; + } + } + + rc = bytes_copied; +put_mm: + mmput(mm); + +put_task_struct: + put_task_struct(task); + +free_proc_pages: + if (process_pages != pp_stack) + kfree(process_pages); + return rc; +} + +/** + * process_vm_rw - check iovecs before calling core routine + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc; + + if (flags != 0) + return -EINVAL; + + /* Check iovecs */ + if (vm_write) + rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + else + rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + + rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, + iovstack_r, &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + + return rc; +} + +SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); +} + +SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, + const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); +} + +#ifdef CONFIG_COMPAT + +asmlinkage ssize_t +compat_process_vm_rw(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc = -EFAULT; + + if (flags != 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) + goto out; + + if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) + goto out; + + if (vm_write) + rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + else + rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, + UIO_FASTIOV, iovstack_r, + &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + 
kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + +out: + return rc; +} + +asmlinkage ssize_t +compat_sys_process_vm_readv(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 0); +} + +asmlinkage ssize_t +compat_sys_process_vm_writev(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 1); +} + +#endif diff --git a/security/keys/compat.c b/security/keys/compat.c index 338b510e9027..4c48e13448f8 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -38,7 +38,7 @@ long compat_keyctl_instantiate_key_iov( ret = compat_rw_copy_check_uvector(WRITE, _payload_iov, ioc, ARRAY_SIZE(iovstack), - iovstack, &iov); + iovstack, &iov, 1); if (ret < 0) return ret; if (ret == 0) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index eca51918c951..0b3f5d72af1c 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1065,7 +1065,7 @@ long keyctl_instantiate_key_iov(key_serial_t id, goto no_payload; ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret < 0) return ret; if (ret == 0) -- cgit v1.2.3-58-ga151 From c9f01245b6a7d77d17deaa71af10f6aca14fa24e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 31 Oct 2011 17:07:15 -0700 Subject: oom: remove oom_disable_count This removes mm->oom_disable_count entirely since it's unnecessary and currently buggy. The counter was intended to be per-process but it's currently decremented in the exit path for each thread that exits, causing it to underflow. The count was originally intended to prevent oom killing threads that share memory with threads that cannot be killed since it doesn't lead to future memory freeing. The counter could be fixed to represent all threads sharing the same mm, but it's better to remove the count since: - it is possible that the OOM_DISABLE thread sharing memory with the victim is waiting on that thread to exit and will actually cause future memory freeing, and - there is no guarantee that a thread is disabled from oom killing just because another thread sharing its mm is oom disabled. 
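The check that replaces the counter is small enough to restate as a standalone predicate. A hedged sketch (the helper name is made up) of the test this patch adds to the sibling-kill pass in oom_kill_task():

#include <linux/oom.h>
#include <linux/sched.h>

/* A task sharing the victim's mm must be skipped by the "kill processes
 * sharing the same memory" pass when its oom_score_adj is pinned to the
 * minimum, i.e. when it is OOM-disabled. */
static inline bool oom_sibling_unkillable(struct task_struct *q)
{
        return q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN;
}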
Signed-off-by: David Rientjes Reported-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Cc: Ying Han Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ---- fs/proc/base.c | 13 ------------- include/linux/mm_types.h | 3 --- kernel/exit.c | 2 -- kernel/fork.c | 10 +--------- mm/oom_kill.c | 23 +++++------------------ 6 files changed, 6 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 25dcbe5fc356..36254645b7cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); - if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&old_mm->oom_disable_count); - atomic_inc(&tsk->mm->oom_disable_count); - } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 5eb02069e1b8..8f0087e20e16 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_adjust != task->signal->oom_adj) { - if (oom_adjust == OOM_DISABLE) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_adj == OOM_DISABLE) - atomic_dec(&task->mm->oom_disable_count); - } - /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_score_adj != task->signal->oom_score_adj) { - if (oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&task->mm->oom_disable_count); - } task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c93d00a6e95d..6456624aa964 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -336,9 +336,6 @@ struct mm_struct { unsigned int token_priority; unsigned int last_interval; - /* How many tasks sharing this mm are OOM_DISABLE */ - atomic_t oom_disable_count; - unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d42..d0b7d988f873 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb272..70d76191afb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -501,7 +501,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); - atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -816,8 +815,6 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1391,13 +1388,8 @@ bad_fork_cleanup_io: 
bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) { - task_lock(p); - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&p->mm->oom_disable_count); - task_unlock(p); + if (p->mm) mmput(p->mm); - } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0d8943bc9fd..2b97e8f04607 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -54,13 +54,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; - if (new_val != old_val) { - if (new_val == OOM_SCORE_ADJ_MIN) - atomic_inc(¤t->mm->oom_disable_count); - else if (old_val == OOM_SCORE_ADJ_MIN) - atomic_dec(¤t->mm->oom_disable_count); - current->signal->oom_score_adj = new_val; - } + current->signal->oom_score_adj = new_val; spin_unlock_irq(&sighand->siglock); return old_val; @@ -172,16 +166,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (!p) return 0; - /* - * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN - * so the entire heuristic doesn't need to be executed for something - * that cannot be killed. - */ - if (atomic_read(&p->mm->oom_disable_count)) { - task_unlock(p); - return 0; - } - /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. @@ -451,6 +435,9 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) for_each_process(q) if (q->mm == mm && !same_thread_group(q, p) && !(q->flags & PF_KTHREAD)) { + if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; + task_lock(q); /* Protect ->comm from prctl() */ pr_err("Kill process %d (%s) sharing same memory\n", task_pid_nr(q), q->comm); @@ -727,7 +714,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - current->mm && !atomic_read(¤t->mm->oom_disable_count)) { + current->mm) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to -- cgit v1.2.3-58-ga151 From f5fc870da2f8798edb5481cd2137a3b2d5bd1b19 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Mon, 31 Oct 2011 17:07:21 -0700 Subject: tmpfs: add "tmpfs" to the Kconfig prompt to make it obvious. Add the leading word "tmpfs" to the Kconfig string to make it blindingly obvious that this selection refers to tmpfs. Signed-off-by: Robert P. J. Day Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 9fe0b349f4cd..5f4c45d4aa10 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -109,7 +109,7 @@ source "fs/proc/Kconfig" source "fs/sysfs/Kconfig" config TMPFS - bool "Virtual memory file system support (former shm fs)" + bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. -- cgit v1.2.3-58-ga151 From bc3e53f682d93df677dbd5006a404722b3adfe18 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 31 Oct 2011 17:07:30 -0700 Subject: mm: distinguish between mlocked and pinned pages Some kernel components pin user space memory (infiniband and perf) (by increasing the page count) and account that memory as "mlocked". The difference between mlocking and pinning is: A. 
mlocked pages are marked with PG_mlocked and are exempt from swapping. Page migration may move them around though. They are kept on a special LRU list. B. Pinned pages cannot be moved because something needs to directly access physical memory. They may not be on any LRU list. I recently saw an mlocked process where mm->locked_vm became bigger than the virtual size of the process (!) because some memory was accounted for twice: once when the page was mlocked and once when the Infiniband layer increased the refcount because it needed to pin the RDMA memory. This patch introduces a separate counter for pinned pages and accounts them separately. Signed-off-by: Christoph Lameter Cc: Mike Marciniszyn Cc: Roland Dreier Cc: Sean Hefty Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/umem.c | 6 +++--- drivers/infiniband/hw/ipath/ipath_user_pages.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++-- fs/proc/task_mmu.c | 2 ++ include/linux/mm_types.h | 2 +- kernel/events/core.c | 6 +++--- 6 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index b645e558876f..9155f91d66bf 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -136,7 +136,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, down_write(&current->mm->mmap_sem); - locked = npages + current->mm->locked_vm; + locked = npages + current->mm->pinned_vm; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { @@ -206,7 +206,7 @@ out: __ib_umem_release(context->device, umem, 0); kfree(umem); } else - current->mm->locked_vm = locked; + current->mm->pinned_vm = locked; up_write(&current->mm->mmap_sem); if (vma_list) @@ -222,7 +222,7 @@ static void ib_umem_account(struct work_struct *work) struct ib_umem *umem = container_of(work, struct ib_umem, work); down_write(&umem->mm->mmap_sem); - umem->mm->locked_vm -= umem->diff; + umem->mm->pinned_vm -= umem->diff; up_write(&umem->mm->mmap_sem); mmput(umem->mm); kfree(umem); diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index cfed5399f074..dc66c4506916 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -79,7 +79,7 @@ static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->locked_vm += num_pages; + current->mm->pinned_vm += num_pages; ret = 0; goto bail; @@ -178,7 +178,7 @@ void ipath_release_user_pages(struct page **p, size_t num_pages) __ipath_release_user_pages(p, num_pages, 1); - current->mm->locked_vm -= num_pages; + current->mm->pinned_vm -= num_pages; up_write(&current->mm->mmap_sem); } @@ -195,7 +195,7 @@ static void user_pages_account(struct work_struct *_work) container_of(_work, struct ipath_user_pages_work, work); down_write(&work->mm->mmap_sem); - work->mm->locked_vm -= work->num_pages; + work->mm->pinned_vm -= work->num_pages; up_write(&work->mm->mmap_sem); mmput(work->mm); kfree(work); diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 7689e49c13c9..2bc1d2b96298 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -74,7 +74,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->locked_vm += 
num_pages; + current->mm->pinned_vm += num_pages; ret = 0; goto bail; @@ -151,7 +151,7 @@ void qib_release_user_pages(struct page **p, size_t num_pages) __qib_release_user_pages(p, num_pages, 1); if (current->mm) { - current->mm->locked_vm -= num_pages; + current->mm->pinned_vm -= num_pages; up_write(¤t->mm->mmap_sem); } } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c7d4ee663f14..e418c5abdb0e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" @@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6456624aa964..f3175830cc73 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -304,7 +304,7 @@ struct mm_struct { unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ - unsigned long total_vm, locked_vm, shared_vm, exec_vm; + unsigned long total_vm, locked_vm, pinned_vm, shared_vm, exec_vm; unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; diff --git a/kernel/events/core.c b/kernel/events/core.c index d1a1bee35228..12a0287e0358 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3544,7 +3544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; + vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); mutex_unlock(&event->mmap_mutex); @@ -3625,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; + locked = vma->vm_mm->pinned_vm + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -3651,7 +3651,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += event->mmap_locked; unlock: if (!ret) -- cgit v1.2.3-58-ga151 From 94054fa3fca1fd78db02cb3d68d5627120f0a1d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:45 -0700 Subject: xfs: warn if direct reclaim tries to writeback pages Direct reclaim should never writeback pages. For now, handle the situation and warn about it. Ultimately, this will be a BUG_ON. 
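The detection rests on two task flags: PF_MEMALLOC is set on any task allocating for reclaim, and kswapd additionally runs with PF_KSWAPD. A hedged restatement of the test used in the hunk below (the helper name is made up):

#include <linux/sched.h>

/* True only for direct reclaim: the task is reclaiming (PF_MEMALLOC set)
 * but is not the kswapd thread (PF_KSWAPD clear). */
static inline bool in_direct_reclaim(void)
{
        return (current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC;
}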
Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/xfs_aops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 11b2aad982d4..33b13310ee0c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -902,11 +902,11 @@ xfs_vm_writepage( * random callers for direct reclaim or memcg reclaim. We explicitly * allow reclaim from kswapd as the stack usage there is relatively low. * - * This should really be done by the core VM, but until that happens - * filesystems like XFS, btrfs and ext4 have to take care of this - * by themselves. + * This should never happen except in the case of a VM regression so + * warn about it. */ - if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) goto redirty; /* -- cgit v1.2.3-58-ga151 From 966dbde2c208e07bab7a45a7855e1e693eabe661 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:48 -0700 Subject: ext4: warn if direct reclaim tries to writeback pages Direct reclaim should never writeback pages. Warn if an attempt is made. Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986e2388f031..0defe0bfe019 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1811,8 +1811,12 @@ static int ext4_writepage(struct page *page, * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list + * We can also reach here via shrink_page_list but it + * should never be for direct reclaim so warn if that + * happens */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC); goto redirty_page; } if (commit_write) -- cgit v1.2.3-58-ga151 From 798248206b59acc6e1238c778281419c041891a7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 31 Oct 2011 17:08:07 -0700 Subject: lib/string.c: introduce memchr_inv() memchr_inv() is mainly used to check whether the whole buffer is filled with just a specified byte. The function name and prototype are stolen from logfs and the implementation is from SLUB. 
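A typical use, mirroring the SLUB poison checks converted below, is verifying that an area holds nothing but a single fill byte; a minimal hedged sketch:

#include <linux/poison.h>
#include <linux/string.h>
#include <linux/types.h>

/* memchr_inv() returns NULL when every byte of the area equals the given
 * value, otherwise a pointer to the first byte that differs. */
static bool area_fully_poisoned(const void *buf, size_t len)
{
        return memchr_inv(buf, POISON_INUSE, len) == NULL;
}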
Signed-off-by: Akinobu Mita Acked-by: Christoph Lameter Acked-by: Pekka Enberg Cc: Matt Mackall Acked-by: Joern Engel Cc: Marcin Slusarz Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/logfs.h | 1 - fs/logfs/super.c | 22 -------------------- include/linux/string.h | 1 + lib/string.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 47 ++----------------------------------------- 5 files changed, 57 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index f22d108bfa5d..398ecff6e548 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); void emergency_read_end(struct page *page); void logfs_crash_dump(struct super_block *sb); -void *memchr_inv(const void *s, int c, size_t n); int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); int logfs_check_ds(struct logfs_disk_super *ds); int logfs_write_sb(struct super_block *sb); diff --git a/fs/logfs/super.c b/fs/logfs/super.c index ce03a182c771..f2697e4df109 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -90,28 +90,6 @@ void logfs_crash_dump(struct super_block *sb) dump_segfile(sb); } -/* - * TODO: move to lib/string.c - */ -/** - * memchr_inv - Find a character in an area of memory. - * @s: The memory area - * @c: The byte to search for - * @n: The size of the area. - * - * returns the address of the first character other than @c, or %NULL - * if the whole buffer contains just @c. - */ -void *memchr_inv(const void *s, int c, size_t n) -{ - const unsigned char *p = s; - while (n-- != 0) - if ((unsigned char)c != *p++) - return (void *)(p - 1); - - return NULL; -} - /* * FIXME: There should be a reserve for root, similar to ext2. */ diff --git a/include/linux/string.h b/include/linux/string.h index a176db2f2c85..e033564f10ba 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -114,6 +114,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif +void *memchr_inv(const void *s, int c, size_t n); extern char *kstrdup(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); diff --git a/lib/string.c b/lib/string.c index 01fad9b203e1..11df54325fb8 100644 --- a/lib/string.c +++ b/lib/string.c @@ -756,3 +756,57 @@ void *memchr(const void *s, int c, size_t n) } EXPORT_SYMBOL(memchr); #endif + +static void *check_bytes8(const u8 *start, u8 value, unsigned int bytes) +{ + while (bytes) { + if (*start != value) + return (void *)start; + start++; + bytes--; + } + return NULL; +} + +/** + * memchr_inv - Find an unmatching character in an area of memory. + * @start: The memory area + * @c: Find a character other than c + * @bytes: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. 
+ */ +void *memchr_inv(const void *start, int c, size_t bytes) +{ + u8 value = c; + u64 value64; + unsigned int words, prefix; + + if (bytes <= 16) + return check_bytes8(start, value, bytes); + + value64 = value | value << 8 | value << 16 | value << 24; + value64 = (value64 & 0xffffffff) | value64 << 32; + prefix = 8 - ((unsigned long)start) % 8; + + if (prefix) { + u8 *r = check_bytes8(start, value, prefix); + if (r) + return r; + start += prefix; + bytes -= prefix; + } + + words = bytes / 8; + + while (words) { + if (*(u64 *)start != value64) + return check_bytes8(start, value, 8); + start += 8; + words--; + } + + return check_bytes8(start, value, bytes % 8); +} +EXPORT_SYMBOL(memchr_inv); diff --git a/mm/slub.c b/mm/slub.c index 95215aa6a75e..7d2a996c307e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -655,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) -{ - while (bytes) { - if (*start != value) - return start; - start++; - bytes--; - } - return NULL; -} - -static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) -{ - u64 value64; - unsigned int words, prefix; - - if (bytes <= 16) - return check_bytes8(start, value, bytes); - - value64 = value | value << 8 | value << 16 | value << 24; - value64 = (value64 & 0xffffffff) | value64 << 32; - prefix = 8 - ((unsigned long)start) % 8; - - if (prefix) { - u8 *r = check_bytes8(start, value, prefix); - if (r) - return r; - start += prefix; - bytes -= prefix; - } - - words = bytes / 8; - - while (words) { - if (*(u64 *)start != value64) - return check_bytes8(start, value, 8); - start += 8; - words--; - } - - return check_bytes8(start, value, bytes % 8); -} - static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -712,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; - fault = check_bytes(start, value, bytes); + fault = memchr_inv(start, value, bytes); if (!fault) return 1; @@ -805,7 +762,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; - fault = check_bytes(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) -- cgit v1.2.3-58-ga151 From 09f363c7363eb10cfb4b82094bd7064e5608258b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 31 Oct 2011 17:08:57 -0700 Subject: vmscan: fix shrinker callback bug in fs/super.c The callback must not return -1 when nr_to_scan is zero. Fix the bug in fs/super.c and add this requirement to the callback specification. Signed-off-by: Mikulas Patocka Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 2 +- include/linux/shrinker.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 3f56a269a4f4..32a81f3467e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return -1; + return !sc->nr_to_scan ? 
0 : -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 790651b4e5ba..a83833a1f7a2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,6 +20,7 @@ struct shrink_control { * 'nr_to_scan' entries and attempt to free them up. It should return * the number of objects which remain in the cache. If it returns -1, it means * it cannot do any scanning at this time (eg. there is a risk of deadlock). + * The callback must not return -1 if nr_to_scan is zero. * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. -- cgit v1.2.3-58-ga151 From 72a2ebd8bc62e6658513d3b2a1119e91c3ea6810 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 31 Oct 2011 17:09:00 -0700 Subject: fs/buffer.c: add device information for error output in __find_get_block_slow() On the ext4 mailing list[1], we got some reports about errors in __find_get_block_slow(), but the information is very limited. If the device information is given, we can know the name of the sick volume. Furthermore, we can get the corresponding status of that block (group, inode block, etc.) by analyzing the disk layout. [1] http://marc.info/?l=linux-ext4&m=131379831421147&w=2 Signed-off-by: Tao Ma Cc: Al Viro Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 936d6035f6e2..70a19745cb61 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { + char b[BDEVNAME_SIZE]; + printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + printk("device %s blocksize: %d\n", bdevname(bdev, b), + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); -- cgit v1.2.3-58-ga151 From d70ef97baf048412c395bb5d65791d8fe133a52b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 31 Oct 2011 17:10:04 -0700 Subject: fs/pipe.c: add ->statfs callback for pipefs Currently a statfs on a pipe's /proc//fd/ link returns -ENOSYS. Wire pipefs up so that the statfs succeeds. This is required by checkpoint-restart in userspace to make it possible to distinguish pipes from fifos. When we dump information about a task's open files we use the /proc/pid/fd directory's symlinks and the fact that opening any of them gives us exactly the same dentry->inode pair as the original process has. Now if a task we're dumping has opened a pipe and a fifo we need to detect this and act accordingly. Knowing that an fd with type S_ISFIFO resides on a pipefs is the most precise way.
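With ->statfs wired up, the classification becomes a single fstatfs() call. A hedged userspace sketch (the magic value is assumed to match PIPEFS_MAGIC in linux/magic.h, which simple_statfs() reports via f_type):

#include <sys/vfs.h>

#define PIPEFS_MAGIC 0x50495045 /* "PIPE"; assumed, see linux/magic.h */

/* For a descriptor already known to be S_ISFIFO: returns 1 for an
 * anonymous pipe (it lives on pipefs), 0 for an on-disk fifo, and -1
 * if fstatfs() failed (pre-patch kernels fail here with ENOSYS). */
static int fd_is_anonymous_pipe(int fd)
{
        struct statfs st;

        if (fstatfs(fd, &st) < 0)
                return -1;
        return st.f_type == PIPEFS_MAGIC;
}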
Signed-off-by: Pavel Emelyanov Reviewed-by: Tejun Heo Acked-by: Serge Hallyn Signed-off-by: Cyrill Gorcunov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 0e0be1dc0f8e..4065f07366b3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1254,6 +1254,7 @@ out: static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, + .statfs = simple_statfs, }; /* -- cgit v1.2.3-58-ga151 From b9075fa968a0a4347aef35e235e2995c0e57dddd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 31 Oct 2011 17:11:33 -0700 Subject: treewide: use __printf not __attribute__((format(printf,...))) Standardize the style for compiler based printf format verification. Standardized the location of __printf too. Done via script and a little typing. $ grep -rPl --include=*.[ch] -w "__attribute__" * | \ grep -vP "^(tools|scripts|include/linux/compiler-gcc.h)" | \ xargs perl -n -i -e 'local $/; while (<>) { s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.+)\s*,\s*(.+)\s*\)\s*\)\s*\)/__printf($1, $2)/g ; print; }' [akpm@linux-foundation.org: revert arch bits] Signed-off-by: Joe Perches Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/hisax/callc.c | 4 +- drivers/isdn/hisax/hisax.h | 4 +- drivers/isdn/hisax/isdnl1.h | 2 +- drivers/isdn/hisax/isdnl3.c | 2 +- drivers/isdn/hisax/st5481_d.c | 4 +- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 +- drivers/net/wireless/ath/ath.h | 5 +- drivers/net/wireless/ath/ath5k/debug.h | 4 +- drivers/net/wireless/ath/ath6kl/debug.h | 4 +- drivers/net/wireless/b43/b43.h | 12 +-- drivers/net/wireless/b43legacy/b43legacy.h | 16 ++-- drivers/staging/iio/trigger.h | 3 +- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ext2/ext2.h | 8 +- fs/ext4/ext4.h | 44 +++++------ fs/fat/fat.h | 9 +-- fs/gfs2/glock.h | 2 +- fs/hpfs/hpfs_fn.h | 4 +- fs/nilfs2/nilfs.h | 8 +- fs/ntfs/debug.h | 15 ++-- fs/ocfs2/super.h | 14 ++-- fs/partitions/ldm.c | 16 ++-- fs/udf/udfdecl.h | 4 +- fs/ufs/ufs.h | 9 ++- fs/xfs/xfs_message.h | 42 +++++----- include/asm-generic/bug.h | 11 +-- include/drm/drmP.h | 10 +-- include/linux/audit.h | 11 ++- include/linux/blktrace_api.h | 2 +- include/linux/device.h | 110 +++++++++++++-------------- include/linux/dynamic_debug.h | 19 +++-- include/linux/ext3_fs.h | 16 ++-- include/linux/fs.h | 4 +- include/linux/fscache-cache.h | 8 +- include/linux/gameport.h | 8 +- include/linux/kallsyms.h | 5 +- include/linux/kdb.h | 9 +-- include/linux/kernel.h | 44 +++++------ include/linux/kexec.h | 4 +- include/linux/kmod.h | 4 +- include/linux/kobject.h | 26 +++---- include/linux/kthread.h | 4 +- include/linux/libata.h | 18 ++--- include/linux/mmiotrace.h | 8 +- include/linux/netdevice.h | 34 ++++----- include/linux/printk.h | 12 +-- include/linux/quotaops.h | 2 +- include/linux/seq_file.h | 3 +- include/linux/trace_seq.h | 8 +- include/net/bluetooth/bluetooth.h | 2 +- include/net/netfilter/nf_log.h | 3 +- include/net/sock.h | 4 +- include/sound/core.h | 4 +- include/sound/info.h | 4 +- include/sound/seq_kernel.h | 4 +- include/xen/hvc-console.h | 4 +- include/xen/xenbus.h | 12 +-- net/nfc/nfc.h | 2 +- net/rds/rds.h | 8 +- net/sunrpc/svc.c | 5 +- sound/firewire/cmp.c | 2 +- 61 files changed, 324 insertions(+), 350 deletions(-) (limited to 'fs') diff --git a/drivers/isdn/hisax/callc.c b/drivers/isdn/hisax/callc.c index 37e685eafd24..c4897e1075d8 100644 --- a/drivers/isdn/hisax/callc.c +++ 
b/drivers/isdn/hisax/callc.c @@ -65,7 +65,7 @@ hisax_findcard(int driverid) return (struct IsdnCardState *) 0; } -static __attribute__((format(printf, 3, 4))) void +static __printf(3, 4) void link_debug(struct Channel *chanp, int direction, char *fmt, ...) { va_list args; @@ -1068,7 +1068,7 @@ init_d_st(struct Channel *chanp) return 0; } -static __attribute__((format(printf, 2, 3))) void +static __printf(2, 3) void callc_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h index 0a5c42a3f125..aff45a11a92d 100644 --- a/drivers/isdn/hisax/hisax.h +++ b/drivers/isdn/hisax/hisax.h @@ -1287,9 +1287,9 @@ int jiftime(char *s, long mark); int HiSax_command(isdn_ctrl * ic); int HiSax_writebuf_skb(int id, int chan, int ack, struct sk_buff *skb); -__attribute__((format(printf, 3, 4))) +__printf(3, 4) void HiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, ...); -__attribute__((format(printf, 3, 0))) +__printf(3, 0) void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, va_list args); void HiSax_reportcard(int cardnr, int sel); int QuickHex(char *txt, u_char * p, int cnt); diff --git a/drivers/isdn/hisax/isdnl1.h b/drivers/isdn/hisax/isdnl1.h index 425d86116f2b..66ddcab19bba 100644 --- a/drivers/isdn/hisax/isdnl1.h +++ b/drivers/isdn/hisax/isdnl1.h @@ -21,7 +21,7 @@ #define B_XMTBUFREADY 1 #define B_ACKPENDING 2 -__attribute__((format(printf, 2, 3))) +__printf(2, 3) void debugl1(struct IsdnCardState *cs, char *fmt, ...); void DChannel_proc_xmt(struct IsdnCardState *cs); void DChannel_proc_rcv(struct IsdnCardState *cs); diff --git a/drivers/isdn/hisax/isdnl3.c b/drivers/isdn/hisax/isdnl3.c index ad291f21b201..1c24e4457b6f 100644 --- a/drivers/isdn/hisax/isdnl3.c +++ b/drivers/isdn/hisax/isdnl3.c @@ -66,7 +66,7 @@ static char *strL3Event[] = "EV_TIMEOUT", }; -static __attribute__((format(printf, 2, 3))) void +static __printf(2, 3) void l3m_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; diff --git a/drivers/isdn/hisax/st5481_d.c b/drivers/isdn/hisax/st5481_d.c index 44082637a09f..db247b79e561 100644 --- a/drivers/isdn/hisax/st5481_d.c +++ b/drivers/isdn/hisax/st5481_d.c @@ -167,7 +167,7 @@ static struct FsmNode L1FnList[] __initdata = {ST_L1_F8, EV_IND_RSY, l1_ignore}, }; -static __attribute__((format(printf, 2, 3))) +static __printf(2, 3) void l1m_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; @@ -270,7 +270,7 @@ static char *strDoutEvent[] = "EV_DOUT_UNDERRUN", }; -static __attribute__((format(printf, 2, 3))) +static __printf(2, 3) void dout_debug(struct FsmInst *fi, char *fmt, ...) { va_list args; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index fca66165110e..8fda331c65df 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -581,8 +581,9 @@ extern const struct ethtool_ops mlx4_en_ethtool_ops; * printk / logging functions */ +__printf(3, 4) int en_print(const char *level, const struct mlx4_en_priv *priv, - const char *format, ...) __attribute__ ((format (printf, 3, 4))); + const char *format, ...); #define en_dbg(mlevel, priv, format, arg...) 
\ do { \ diff --git a/drivers/net/wireless/ath/ath.h b/drivers/net/wireless/ath/ath.h index 908fdbc3e0ee..0f9ee46cfc97 100644 --- a/drivers/net/wireless/ath/ath.h +++ b/drivers/net/wireless/ath/ath.h @@ -173,8 +173,7 @@ bool ath_hw_keyreset(struct ath_common *common, u16 entry); void ath_hw_cycle_counters_update(struct ath_common *common); int32_t ath_hw_get_listen_time(struct ath_common *common); -extern __attribute__((format (printf, 2, 3))) -void ath_printk(const char *level, const char *fmt, ...); +extern __printf(2, 3) void ath_printk(const char *level, const char *fmt, ...); #define _ath_printk(level, common, fmt, ...) \ do { \ @@ -258,7 +257,7 @@ do { \ #else -static inline __attribute__((format (printf, 3, 4))) +static inline __attribute__ ((format (printf, 3, 4))) void ath_dbg(struct ath_common *common, enum ATH_DEBUG dbg_mask, const char *fmt, ...) { diff --git a/drivers/net/wireless/ath/ath5k/debug.h b/drivers/net/wireless/ath/ath5k/debug.h index 7f37df3125fd..0a3f916a1ef3 100644 --- a/drivers/net/wireless/ath/ath5k/debug.h +++ b/drivers/net/wireless/ath/ath5k/debug.h @@ -141,10 +141,10 @@ ath5k_debug_printtxbuf(struct ath5k_hw *ah, struct ath5k_buf *bf); #include -static inline void __attribute__ ((format (printf, 3, 4))) +static inline __printf(3, 4) void ATH5K_DBG(struct ath5k_hw *ah, unsigned int m, const char *fmt, ...) {} -static inline void __attribute__ ((format (printf, 3, 4))) +static inline __printf(3, 4) void ATH5K_DBG_UNLIMIT(struct ath5k_hw *ah, unsigned int m, const char *fmt, ...) {} diff --git a/drivers/net/wireless/ath/ath6kl/debug.h b/drivers/net/wireless/ath/ath6kl/debug.h index 9288a3ce1e39..7b7675f70a10 100644 --- a/drivers/net/wireless/ath/ath6kl/debug.h +++ b/drivers/net/wireless/ath/ath6kl/debug.h @@ -44,8 +44,8 @@ enum ATH6K_DEBUG_MASK { }; extern unsigned int debug_mask; -extern int ath6kl_printk(const char *level, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int ath6kl_printk(const char *level, const char *fmt, ...); #define ath6kl_info(fmt, ...) \ ath6kl_printk(KERN_INFO, fmt, ##__VA_ARGS__) diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h index 447a2307c9d9..37110dfd2c96 100644 --- a/drivers/net/wireless/b43/b43.h +++ b/drivers/net/wireless/b43/b43.h @@ -1011,14 +1011,10 @@ static inline bool b43_using_pio_transfers(struct b43_wldev *dev) } /* Message printing */ -void b43info(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -void b43err(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -void b43warn(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -void b43dbg(struct b43_wl *wl, const char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); +__printf(2, 3) void b43info(struct b43_wl *wl, const char *fmt, ...); +__printf(2, 3) void b43err(struct b43_wl *wl, const char *fmt, ...); +__printf(2, 3) void b43warn(struct b43_wl *wl, const char *fmt, ...); +__printf(2, 3) void b43dbg(struct b43_wl *wl, const char *fmt, ...); /* A WARN_ON variant that vanishes when b43 debugging is disabled. 
diff --git a/drivers/net/wireless/b43legacy/b43legacy.h b/drivers/net/wireless/b43legacy/b43legacy.h index 12b518251581..1d4fc9db7f5e 100644 --- a/drivers/net/wireless/b43legacy/b43legacy.h +++ b/drivers/net/wireless/b43legacy/b43legacy.h @@ -810,15 +810,15 @@ struct b43legacy_lopair *b43legacy_get_lopair(struct b43legacy_phy *phy, /* Message printing */ -void b43legacyinfo(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); -void b43legacyerr(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); -void b43legacywarn(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); +__printf(2, 3) +void b43legacyinfo(struct b43legacy_wl *wl, const char *fmt, ...); +__printf(2, 3) +void b43legacyerr(struct b43legacy_wl *wl, const char *fmt, ...); +__printf(2, 3) +void b43legacywarn(struct b43legacy_wl *wl, const char *fmt, ...); #if B43legacy_DEBUG -void b43legacydbg(struct b43legacy_wl *wl, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); +__printf(2, 3) +void b43legacydbg(struct b43legacy_wl *wl, const char *fmt, ...); #else /* DEBUG */ # define b43legacydbg(wl, fmt...) do { /* nothing */ } while (0) #endif /* DEBUG */ diff --git a/drivers/staging/iio/trigger.h b/drivers/staging/iio/trigger.h index 598fcb3599f9..5cc42a655c88 100644 --- a/drivers/staging/iio/trigger.h +++ b/drivers/staging/iio/trigger.h @@ -115,8 +115,7 @@ void iio_trigger_poll_chained(struct iio_trigger *trig, s64 time); irqreturn_t iio_trigger_generic_data_rdy_poll(int irq, void *private); -struct iio_trigger *iio_allocate_trigger(const char *fmt, ...) - __attribute__((format(printf, 1, 2))); +__printf(1, 2) struct iio_trigger *iio_allocate_trigger(const char *fmt, ...); void iio_free_trigger(struct iio_trigger *trig); #endif /* _IIO_TRIGGER_H_ */ diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b36c5572b3f3..54481a3b2c79 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); -__attribute__ ((format(printf, 1, 2))) +__printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index af9fc89b1b2d..9a4e5e206d08 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ -extern void ext2_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext2_msg(struct super_block *, const char *, const char *, ...) 
- __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext2_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d7bd0f066e..cec3145e532c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1878,40 +1878,40 @@ extern int ext4_group_extend(struct super_block *sb, extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); -extern void __ext4_error(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_error(sb, message...) __ext4_error(sb, __func__, \ __LINE__, ## message) -extern void ext4_error_inode(struct inode *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); -extern void ext4_error_file(struct file *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); +extern __printf(5, 6) +void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern void __ext4_abort(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ __LINE__, ## message) -extern void __ext4_warning(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ __LINE__, ## message) -extern void ext4_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ __LINE__, msg) -extern void __ext4_grp_locked_error(const char *, unsigned int, \ - struct super_block *, ext4_group_t, \ - unsigned long, ext4_fsblk_t, \ - const char *, ...) - __attribute__ ((format (printf, 7, 8))); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); #define ext4_grp_locked_error(sb, grp, message...) 
\ __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a5d3853822e0..1510a4d51990 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void -__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +extern __printf(3, 4) __cold +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); #define fat_fs_error(sb, fmt, args...) \ __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +__printf(3, 4) __cold +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 66707118af25..2553b858a72e 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); -__attribute__ ((format(printf, 2, 3))) +__printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 331b5e234ef3..de946170ebb1 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) /* super.c */ -void hpfs_error(struct super_block *, const char *, ...) - __attribute__((format (printf, 2, 3))); +__printf(2, 3) +void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_count_one_bitmap(struct super_block *, secno); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 255d5e1c03b7..3777d138f895 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); -extern void nilfs_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 
- __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void nilfs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void nilfs_warning(struct super_block *, const char *, const char *, ...); extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 2142b1c68b61..53c27eaf2307 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -30,8 +30,9 @@ extern int debug_msgs; -extern void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...) __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); /** * ntfs_debug - write a debug level message to syslog * @f: a printf format string containing the message @@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #endif /* !DEBUG */ -extern void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) -extern void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 40c7de084c10..74ff74cf78fe 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq; int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) -void __ocfs2_abort(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_abort(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index af9fdf046769..bd8ae788f689 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -49,18 +49,20 @@ #define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) #define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) -__attribute__ ((format (printf, 3, 4))) -static void _ldm_printk (const char *level, const char *function, - const char *fmt, ...) +static __printf(3, 4) +void _ldm_printk(const char *level, const char *function, const char *fmt, ...) 
{ - static char buf[128]; + struct va_format vaf; va_list args; va_start (args, fmt); - vsnprintf (buf, sizeof (buf), fmt, args); - va_end (args); - printk ("%s%s(): %s\n", level, function, buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%s(): %pV\n", level, function, &vaf); + + va_end(args); } /** diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index dbd52d4b5eed..dc8a8dcc5ae1 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -112,8 +112,8 @@ struct extent_position { /* super.c */ -__attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) void udf_warning(struct super_block *, const char *, + const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 5be2755dd715..c26f2bcec264 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf extern const struct file_operations ufs_dir_operations; /* super.c */ -extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ufs_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_panic(struct super_block *, const char *, const char *, ...); /* symlink.c */ extern const struct inode_operations ufs_fast_symlink_inode_operations; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb7ea007672..56dc0c17f16a 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -3,31 +3,29 @@ struct xfs_mount; -extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) 
- __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +extern __printf(2, 3) +void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); #ifdef DEBUG -extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); #else -static inline void -__attribute__ ((format (printf, 2, 3))) -xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +static inline __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) { } #endif diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index dfb0ec666c94..84458b0c38d1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -61,11 +61,12 @@ struct bug_entry { */ #ifndef __WARN_TAINT #ifndef __ASSEMBLY__ -extern void warn_slowpath_fmt(const char *file, const int line, - const char *fmt, ...) __attribute__((format(printf, 3, 4))); -extern void warn_slowpath_fmt_taint(const char *file, const int line, - unsigned taint, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); +extern __printf(3, 4) +void warn_slowpath_fmt(const char *file, const int line, + const char *fmt, ...); +extern __printf(4, 5) +void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint, + const char *fmt, ...); extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #endif diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 43538b643560..cf3b446139ea 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -122,12 +122,12 @@ struct drm_device; * using the DRM_DEBUG_KMS and DRM_DEBUG. */ -extern __attribute__((format (printf, 4, 5))) +extern __printf(4, 5) void drm_ut_debug_printk(unsigned int request_level, - const char *prefix, - const char *function_name, - const char *format, ...); -extern __attribute__((format (printf, 2, 3))) + const char *prefix, + const char *function_name, + const char *format, ...); +extern __printf(2, 3) int drm_err(const char *func, const char *format, ...); /***********************************************************************/ diff --git a/include/linux/audit.h b/include/linux/audit.h index 0c8006129fb2..2f81c6f3b630 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -584,14 +584,13 @@ extern int audit_signals; #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ -extern void audit_log(struct audit_context *ctx, gfp_t gfp_mask, - int type, const char *fmt, ...) - __attribute__((format(printf,4,5))); +extern __printf(4, 5) +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, + const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); -extern void audit_log_format(struct audit_buffer *ab, - const char *fmt, ...) 
- __attribute__((format(printf,2,3))); +extern __printf(2, 3) +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern int audit_string_contains_control(const char *string, size_t len); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 8e9e4bc6d73b..4d1a0748eaf8 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -170,7 +170,7 @@ extern void blk_trace_shutdown(struct request_queue *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts); -extern __attribute__((format(printf, 2, 3))) +extern __printf(2, 3) void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** diff --git a/include/linux/device.h b/include/linux/device.h index 85e78fc7d7fd..e88abeecfadf 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -622,8 +622,8 @@ static inline const char *dev_name(const struct device *dev) return kobject_name(&dev->kobj); } -extern int dev_set_name(struct device *dev, const char *name, ...) - __attribute__((format(printf, 2, 3))); +extern __printf(2, 3) +int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) @@ -753,10 +753,10 @@ extern struct device *device_create_vargs(struct class *cls, void *drvdata, const char *fmt, va_list vargs); -extern struct device *device_create(struct class *cls, struct device *parent, - dev_t devt, void *drvdata, - const char *fmt, ...) - __attribute__((format(printf, 5, 6))); +extern __printf(5, 6) +struct device *device_create(struct class *cls, struct device *parent, + dev_t devt, void *drvdata, + const char *fmt, ...); extern void device_destroy(struct class *cls, dev_t devt); /* @@ -800,64 +800,56 @@ extern const char *dev_driver_string(const struct device *dev); extern int __dev_printk(const char *level, const struct device *dev, struct va_format *vaf); -extern int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern int dev_emerg(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_alert(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_crit(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_err(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_warn(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int dev_notice(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int _dev_info(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(3, 4) +int dev_printk(const char *level, const struct device *dev, + const char *fmt, ...) 
+ ; +extern __printf(2, 3) +int dev_emerg(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_alert(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_crit(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_err(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_warn(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int dev_notice(const struct device *dev, const char *fmt, ...); +extern __printf(2, 3) +int _dev_info(const struct device *dev, const char *fmt, ...); #else static inline int __dev_printk(const char *level, const struct device *dev, struct va_format *vaf) - { return 0; } -static inline int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -static inline int dev_printk(const char *level, const struct device *dev, - const char *fmt, ...) - { return 0; } - -static inline int dev_emerg(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_emerg(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_crit(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_crit(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_alert(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_alert(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_err(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_err(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_warn(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_warn(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int dev_notice(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int dev_notice(const struct device *dev, const char *fmt, ...) - { return 0; } -static inline int _dev_info(const struct device *dev, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -static inline int _dev_info(const struct device *dev, const char *fmt, ...) - { return 0; } +{ return 0; } +static inline __printf(3, 4) +int dev_printk(const char *level, const struct device *dev, + const char *fmt, ...) +{ return 0; } + +static inline __printf(2, 3) +int dev_emerg(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_crit(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_alert(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_err(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_warn(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int dev_notice(const struct device *dev, const char *fmt, ...) +{ return 0; } +static inline __printf(2, 3) +int _dev_info(const struct device *dev, const char *fmt, ...) 
+{ return 0; } #endif diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 13aae8087b56..0564e3c39882 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -37,22 +37,21 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, #if defined(CONFIG_DYNAMIC_DEBUG) extern int ddebug_remove_module(const char *mod_name); -extern int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); struct device; -extern int __dynamic_dev_dbg(struct _ddebug *descriptor, - const struct device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +int __dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, + const char *fmt, ...); struct net_device; -extern int __dynamic_netdev_dbg(struct _ddebug *descriptor, - const struct net_device *dev, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +int __dynamic_netdev_dbg(struct _ddebug *descriptor, + const struct net_device *dev, + const char *fmt, ...); #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ static struct _ddebug __used __aligned(8) \ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 67a803aee619..81965cce6bfa 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -937,15 +937,15 @@ extern int ext3_group_extend(struct super_block *sb, ext3_fsblk_t n_blocks_count); /* super.c */ -extern void ext3_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext3_error(struct super_block *, const char *, const char *, ...); extern void __ext3_std_error (struct super_block *, const char *, int); -extern void ext3_abort (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext3_warning (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext3_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext3_abort(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_msg(struct super_block *, const char *, const char *, ...); extern void ext3_update_dynamic_rev (struct super_block *sb); #define ext3_std_error(sb, errno) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 87b4c6b9692d..7a049fd2aa4c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2635,8 +2635,8 @@ static const struct file_operations __fops = { \ .llseek = generic_file_llseek, \ }; -static inline void __attribute__((format(printf, 1, 2))) -__simple_attr_check_format(const char *fmt, ...) +static inline __printf(1, 2) +void __simple_attr_check_format(const char *fmt, ...) 
{ /* don't do anything, just let the compiler check the arguments; */ } diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index af095b54502e..ce31408b1e47 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -492,10 +492,10 @@ static inline void fscache_end_io(struct fscache_retrieval *op, /* * out-of-line cache backend functions */ -extern void fscache_init_cache(struct fscache_cache *cache, - const struct fscache_cache_ops *ops, - const char *idfmt, - ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, ...); extern int fscache_add_cache(struct fscache_cache *cache, struct fscache_object *fsdef, diff --git a/include/linux/gameport.h b/include/linux/gameport.h index b65a6f472775..069ee4139105 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -78,8 +78,8 @@ static inline void gameport_register_port(struct gameport *gameport) void gameport_unregister_port(struct gameport *gameport); -void gameport_set_phys(struct gameport *gameport, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +void gameport_set_phys(struct gameport *gameport, const char *fmt, ...); #else @@ -93,8 +93,8 @@ static inline void gameport_unregister_port(struct gameport *gameport) return; } -static inline void gameport_set_phys(struct gameport *gameport, - const char *fmt, ...) +static inline __printf(2, 3) +void gameport_set_phys(struct gameport *gameport, const char *fmt, ...) { return; } diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 0df513b7a9f8..387571959dd9 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -101,9 +101,8 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u #endif /*CONFIG_KALLSYMS*/ /* This macro allows us to keep printk typechecking */ -static void __check_printsym_format(const char *fmt, ...) -__attribute__((format(printf,1,2))); -static inline void __check_printsym_format(const char *fmt, ...) +static __printf(1, 2) +void __check_printsym_format(const char *fmt, ...) { } diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 529d9a0c75a5..064725854db8 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -114,12 +114,9 @@ typedef enum { } kdb_reason_t; extern int kdb_trap_printk; -extern int vkdb_printf(const char *fmt, va_list args) - __attribute__ ((format (printf, 1, 0))); -extern int kdb_printf(const char *, ...) - __attribute__ ((format (printf, 1, 2))); -typedef int (*kdb_printf_t)(const char *, ...) - __attribute__ ((format (printf, 1, 2))); +extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args); +extern __printf(1, 2) int kdb_printf(const char *, ...); +typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); extern void kdb_init(int level); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8eefcf7e95eb..e40c950e1d62 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -296,20 +296,18 @@ extern long long simple_strtoll(const char *,char **,unsigned int); #define strict_strtoull kstrtoull #define strict_strtoll kstrtoll -extern int sprintf(char * buf, const char * fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int vsprintf(char *buf, const char *, va_list) - __attribute__ ((format (printf, 2, 0))); -extern int snprintf(char * buf, size_t size, const char * fmt, ...) 
- __attribute__ ((format (printf, 3, 4))); -extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) - __attribute__ ((format (printf, 3, 0))); -extern int scnprintf(char * buf, size_t size, const char * fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) - __attribute__ ((format (printf, 3, 0))); -extern char *kasprintf(gfp_t gfp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...); +extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list); +extern __printf(3, 4) +int snprintf(char *buf, size_t size, const char *fmt, ...); +extern __printf(3, 0) +int vsnprintf(char *buf, size_t size, const char *fmt, va_list args); +extern __printf(3, 4) +int scnprintf(char *buf, size_t size, const char *fmt, ...); +extern __printf(3, 0) +int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); +extern __printf(2, 3) +char *kasprintf(gfp_t gfp, const char *fmt, ...); extern char *kvasprintf(gfp_t gfp, const char *fmt, va_list args); extern int sscanf(const char *, const char *, ...) @@ -427,8 +425,8 @@ extern void tracing_start(void); extern void tracing_stop(void); extern void ftrace_off_permanent(void); -static inline void __attribute__ ((format (printf, 1, 2))) -____trace_printk_check_format(const char *fmt, ...) +static inline __printf(1, 2) +void ____trace_printk_check_format(const char *fmt, ...) { } #define __trace_printk_check_format(fmt, args...) \ @@ -467,13 +465,11 @@ do { \ __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) -extern int -__trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int __trace_bprintk(unsigned long ip, const char *fmt, ...); -extern int -__trace_printk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +int __trace_printk(unsigned long ip, const char *fmt, ...); extern void trace_dump_stack(void); @@ -502,8 +498,8 @@ __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else -static inline int -trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +static inline __printf(1, 2) +int trace_printk(const char *fmt, ...); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c2478a342cd7..8944ebe7963e 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -130,8 +130,8 @@ int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); -void vmcoreinfo_append_str(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); unsigned long paddr_vmcoreinfo_note(void); #define VMCOREINFO_OSRELEASE(value) \ diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 0da38cf7db7b..b16f65390734 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -32,8 +32,8 @@ extern char modprobe_path[]; /* for sysctl */ /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ -extern int __request_module(bool wait, const char *name, ...) 
\ - __attribute__((format(printf, 2, 3))); +extern __printf(2, 3) +int __request_module(bool wait, const char *name, ...); #define request_module(mod...) __request_module(true, mod) #define request_module_nowait(mod...) __request_module(false, mod) #define try_then_request_module(x, mod...) \ diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 668729cc0fe9..ad81e1c51487 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -72,8 +72,8 @@ struct kobject { unsigned int uevent_suppress:1; }; -extern int kobject_set_name(struct kobject *kobj, const char *name, ...) - __attribute__((format(printf, 2, 3))); +extern __printf(2, 3) +int kobject_set_name(struct kobject *kobj, const char *name, ...); extern int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs); @@ -83,15 +83,13 @@ static inline const char *kobject_name(const struct kobject *kobj) } extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); -extern int __must_check kobject_add(struct kobject *kobj, - struct kobject *parent, - const char *fmt, ...) - __attribute__((format(printf, 3, 4))); -extern int __must_check kobject_init_and_add(struct kobject *kobj, - struct kobj_type *ktype, - struct kobject *parent, - const char *fmt, ...) - __attribute__((format(printf, 4, 5))); +extern __printf(3, 4) __must_check +int kobject_add(struct kobject *kobj, struct kobject *parent, + const char *fmt, ...); +extern __printf(4, 5) __must_check +int kobject_init_and_add(struct kobject *kobj, + struct kobj_type *ktype, struct kobject *parent, + const char *fmt, ...); extern void kobject_del(struct kobject *kobj); @@ -212,8 +210,8 @@ int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); -int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) - __attribute__((format (printf, 2, 3))); +__printf(2, 3) +int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); int kobject_action_type(const char *buf, size_t count, enum kobject_action *type); @@ -226,7 +224,7 @@ static inline int kobject_uevent_env(struct kobject *kobj, char *envp[]) { return 0; } -static inline __attribute__((format(printf, 2, 3))) +static inline __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) { return 0; } diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 1e923e5e88e8..5cac19b3a266 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,11 +4,11 @@ #include #include +__printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, - const char namefmt[], ...) - __attribute__((format(printf, 4, 5))); + const char namefmt[], ...); #define kthread_create(threadfn, data, namefmt, arg...) 
\ kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) diff --git a/include/linux/libata.h b/include/linux/libata.h index 23fa829bf7a3..cafc09a64fe4 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1256,13 +1256,13 @@ static inline int sata_srst_pmp(struct ata_link *link) /* * printk helpers */ -__attribute__((format (printf, 3, 4))) +__printf(3, 4) int ata_port_printk(const struct ata_port *ap, const char *level, const char *fmt, ...); -__attribute__((format (printf, 3, 4))) +__printf(3, 4) int ata_link_printk(const struct ata_link *link, const char *level, const char *fmt, ...); -__attribute__((format (printf, 3, 4))) +__printf(3, 4) int ata_dev_printk(const struct ata_device *dev, const char *level, const char *fmt, ...); @@ -1304,10 +1304,10 @@ void ata_print_version(const struct device *dev, const char *version); /* * ata_eh_info helpers */ -extern void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...); +extern __printf(2, 3) +void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...); extern void ata_ehi_clear_desc(struct ata_eh_info *ehi); static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi) @@ -1321,8 +1321,8 @@ static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi) /* * port description helpers */ -extern void ata_port_desc(struct ata_port *ap, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void ata_port_desc(struct ata_port *ap, const char *fmt, ...); #ifdef CONFIG_PCI extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset, const char *name); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 97491f78b08c..c5d52780d6a0 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -49,8 +49,7 @@ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, extern void mmiotrace_iounmap(volatile void __iomem *addr); /* For anyone to insert markers. Remember trailing newline. */ -extern int mmiotrace_printk(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); +extern __printf(1, 2) int mmiotrace_printk(const char *fmt, ...); #else /* !CONFIG_MMIOTRACE: */ static inline int is_kmmio_active(void) { @@ -71,10 +70,7 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } -static inline int mmiotrace_printk(const char *fmt, ...) - __attribute__ ((format (printf, 1, 0))); - -static inline int mmiotrace_printk(const char *fmt, ...) +static inline __printf(1, 2) int mmiotrace_printk(const char *fmt, ...) { return 0; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df1c836e6948..cbeb5867cff7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2622,23 +2622,23 @@ static inline const char *netdev_name(const struct net_device *dev) extern int __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf); -extern int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) - __attribute__ ((format (printf, 3, 4))); -extern int netdev_emerg(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_alert(const struct net_device *dev, const char *format, ...) 
- __attribute__ ((format (printf, 2, 3))); -extern int netdev_crit(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_err(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_warn(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_notice(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); -extern int netdev_info(const struct net_device *dev, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(3, 4) +int netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); +extern __printf(2, 3) +int netdev_emerg(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_alert(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_crit(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_err(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_warn(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_notice(const struct net_device *dev, const char *format, ...); +extern __printf(2, 3) +int netdev_info(const struct net_device *dev, const char *format, ...); #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) diff --git a/include/linux/printk.h b/include/linux/printk.h index 0101d55d9651..f0e22f75143f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -82,22 +82,22 @@ struct va_format { * Dummy printk for disabled debugging statements to use whilst maintaining * gcc's format and side-effect checking. */ -static inline __attribute__ ((format (printf, 1, 2))) +static inline __printf(1, 2) int no_printk(const char *fmt, ...) { return 0; } -extern asmlinkage __attribute__ ((format (printf, 1, 2))) +extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); extern int printk_needs_cpu(int cpu); extern void printk_tick(void); #ifdef CONFIG_PRINTK -asmlinkage __attribute__ ((format (printf, 1, 0))) +asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); -asmlinkage __attribute__ ((format (printf, 1, 2))) __cold +asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); /* @@ -117,12 +117,12 @@ extern int kptr_restrict; void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else -static inline __attribute__ ((format (printf, 1, 0))) +static inline __printf(1, 0) int vprintk(const char *s, va_list args) { return 0; } -static inline __attribute__ ((format (printf, 1, 2))) __cold +static inline __printf(1, 2) __cold int printk(const char *s, ...) { return 0; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 26f9e3612e0f..d93f95e6177c 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -31,7 +31,7 @@ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) #define quota_error(sb, fmt, args...) 
\ __quota_error((sb), __func__, fmt , ## args) -extern __attribute__((format (printf, 3, 4))) +extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index be720cd2038d..0b69a4684216 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -84,8 +84,7 @@ int seq_putc(struct seq_file *m, char c); int seq_puts(struct seq_file *m, const char *s); int seq_write(struct seq_file *seq, const void *data, size_t len); -int seq_printf(struct seq_file *, const char *, ...) - __attribute__ ((format (printf,2,3))); +__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...); int seq_path(struct seq_file *, struct path *, char *); int seq_dentry(struct seq_file *, struct dentry *, char *); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 5cf397ceb726..7dadc3df0c77 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -29,10 +29,10 @@ trace_seq_init(struct trace_seq *s) * Currently only defined when tracing is enabled. */ #ifdef CONFIG_TRACING -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) - __attribute__ ((format (printf, 2, 0))); +extern __printf(2, 3) +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern __printf(2, 0) +int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args); extern int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e727555d4ee9..e86af08293a8 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -77,7 +77,7 @@ struct bt_power { #define BT_POWER_FORCE_ACTIVE_OFF 0 #define BT_POWER_FORCE_ACTIVE_ON 1 -__attribute__((format (printf, 2, 3))) +__printf(2, 3) int bt_printk(const char *level, const char *fmt, ...); #define BT_INFO(fmt, arg...) bt_printk(KERN_INFO, pr_fmt(fmt), ##arg) diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 920997f1aff0..e991bd0a27af 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -53,12 +53,13 @@ int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger); void nf_log_unbind_pf(u_int8_t pf); /* Calls the registered backend logging function */ +__printf(7, 8) void nf_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li, - const char *fmt, ...) __attribute__ ((format(printf,7,8))); + const char *fmt, ...); #endif /* _NF_LOG_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 5ac682f73d63..c6658bef7f32 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -76,8 +76,8 @@ printk(KERN_DEBUG msg); } while (0) #else /* Validate arguments and do nothing */ -static inline void __attribute__ ((format (printf, 2, 3))) -SOCK_DEBUG(struct sock *sk, const char *msg, ...) +static inline __printf(2, 3) +void SOCK_DEBUG(struct sock *sk, const char *msg, ...) 
{ } #endif diff --git a/include/sound/core.h b/include/sound/core.h index 1fa2407c966f..91d513879a78 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -326,9 +326,9 @@ void release_and_free_resource(struct resource *res); /* --- */ #if defined(CONFIG_SND_DEBUG) || defined(CONFIG_SND_VERBOSE_PRINTK) +__printf(4, 5) void __snd_printk(unsigned int level, const char *file, int line, - const char *format, ...) - __attribute__ ((format (printf, 4, 5))); + const char *format, ...); #else #define __snd_printk(level, file, line, format, args...) \ printk(format, ##args) diff --git a/include/sound/info.h b/include/sound/info.h index 4e94cf1ff762..5492cc40dc57 100644 --- a/include/sound/info.h +++ b/include/sound/info.h @@ -110,8 +110,8 @@ void snd_card_info_read_oss(struct snd_info_buffer *buffer); static inline void snd_card_info_read_oss(struct snd_info_buffer *buffer) {} #endif -int snd_iprintf(struct snd_info_buffer *buffer, const char *fmt, ...) \ - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +int snd_iprintf(struct snd_info_buffer *buffer, const char *fmt, ...); int snd_info_init(void); int snd_info_done(void); diff --git a/include/sound/seq_kernel.h b/include/sound/seq_kernel.h index 3d9afb6a8c9c..f352a98ce4f4 100644 --- a/include/sound/seq_kernel.h +++ b/include/sound/seq_kernel.h @@ -75,9 +75,9 @@ struct snd_seq_port_callback { }; /* interface for kernel client */ +__printf(3, 4) int snd_seq_create_kernel_client(struct snd_card *card, int client_index, - const char *name_fmt, ...) - __attribute__ ((format (printf, 3, 4))); + const char *name_fmt, ...); int snd_seq_delete_kernel_client(int client); int snd_seq_kernel_client_enqueue(int client, struct snd_seq_event *ev, int atomic, int hop); int snd_seq_kernel_client_dispatch(int client, struct snd_seq_event *ev, int atomic, int hop); diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index 901724dc528d..b62dfef15f61 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -6,12 +6,12 @@ extern struct console xenboot_console; #ifdef CONFIG_HVC_XEN void xen_console_resume(void); void xen_raw_console_write(const char *str); -__attribute__((format(printf, 1, 2))) +__printf(1, 2) void xen_raw_printk(const char *fmt, ...); #else static inline void xen_console_resume(void) { } static inline void xen_raw_console_write(const char *str) { } -static inline __attribute__((format(printf, 1, 2))) +static inline __printf(1, 2) void xen_raw_printk(const char *fmt, ...) { } #endif diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index aceeca799fd7..b9f9fb5af0d8 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -156,9 +156,9 @@ int xenbus_scanf(struct xenbus_transaction t, __attribute__((format(scanf, 4, 5))); /* Single printf and write: returns -errno or 0. */ +__printf(4, 5) int xenbus_printf(struct xenbus_transaction t, - const char *dir, const char *node, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); + const char *dir, const char *node, const char *fmt, ...); /* Generic read function: NULL-terminated triples of name, * sprintf-style type string, and pointer. Returns 0 or errno.*/ @@ -200,11 +200,11 @@ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, const char **, unsigned int)); +__printf(4, 5) int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, const char **, unsigned int), - const char *pathfmt, ...) 
- __attribute__ ((format (printf, 4, 5))); + const char *pathfmt, ...); int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn); @@ -223,9 +223,9 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port); enum xenbus_state xenbus_read_driver_state(const char *path); -__attribute__((format(printf, 3, 4))) +__printf(3, 4) void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...); -__attribute__((format(printf, 3, 4))) +__printf(3, 4) void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); const char *xenbus_strstate(enum xenbus_state state); diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index b6753f45624e..d86583f4831d 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -27,7 +27,7 @@ #include #include -__attribute__((format (printf, 2, 3))) +__printf(2, 3) int nfc_printk(const char *level, const char *fmt, ...); #define nfc_info(fmt, arg...) nfc_printk(KERN_INFO, fmt, ##arg) diff --git a/net/rds/rds.h b/net/rds/rds.h index da8adac2bf06..7eaba1831f0d 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -36,8 +36,8 @@ #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ -static inline void __attribute__ ((format (printf, 1, 2))) -rdsdebug(char *fmt, ...) +static inline __printf(1, 2) +void rdsdebug(char *fmt, ...) { } #endif @@ -625,8 +625,8 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), size_t item_len); -void __rds_conn_error(struct rds_connection *conn, const char *, ...) - __attribute__ ((format (printf, 2, 3))); +__printf(2, 3) +void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 30d70abb4e2c..dd5cc00ed559 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -971,9 +971,8 @@ static void svc_unregister(const struct svc_serv *serv) /* * Printk the given error with the address of the client that caused it. */ -static int -__attribute__ ((format (printf, 2, 3))) -svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) +static __printf(2, 3) +int svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) { va_list args; int r; diff --git a/sound/firewire/cmp.c b/sound/firewire/cmp.c index 14cacbc655dd..76294f2ae47f 100644 --- a/sound/firewire/cmp.c +++ b/sound/firewire/cmp.c @@ -32,7 +32,7 @@ enum bus_reset_handling { SUCCEED_ON_BUS_RESET, }; -static __attribute__((format(printf, 2, 3))) +static __printf(2, 3) void cmp_error(struct cmp_connection *c, const char *fmt, ...) { va_list va; -- cgit v1.2.3-58-ga151 From 0a90e0f1012e576500b551455b046013324826b9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Oct 2011 17:12:57 -0700 Subject: fat: follow rename pack_hex_byte() to hex_byte_pack() There is no functional change. 
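For context (an illustrative sketch, not part of the patch): the helper involved is roughly the following, as defined in include/linux/kernel.h around this time, with the old name kept as a thin wrapper until all callers are converted, which is why the conversion below is purely mechanical:

static inline char *hex_byte_pack(char *buf, u8 byte)
{
	*buf++ = hex_asc_hi(byte);
	*buf++ = hex_asc_lo(byte);
	return buf;
}

/* Deprecated alias; kept so callers can be converted one by one. */
static inline char *pack_hex_byte(char *buf, u8 byte)
{
	return hex_byte_pack(buf, byte);
}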
Signed-off-by: Andy Shevchenko Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 5efbd5d7701a..aca191bd5f8f 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, } else { if (uni_xlate == 1) { *op++ = ':'; - op = pack_hex_byte(op, ec >> 8); - op = pack_hex_byte(op, ec); + op = hex_byte_pack(op, ec >> 8); + op = hex_byte_pack(op, ec); len -= 5; } else { *op++ = '?'; -- cgit v1.2.3-58-ga151 From d8805e633e054c816c47cb6e727c81f156d9253d Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Mon, 31 Oct 2011 17:13:14 -0700 Subject: epoll: fix spurious lockdep warnings epoll can recursively acquire ep->mtx on multiple "struct eventpoll"s at once in the case where one epoll fd is monitoring another epoll fd. This is perfectly OK, since we're careful about the lock ordering, but it causes spurious lockdep warnings. Annotate the recursion using mutex_lock_nested, and add a comment explaining the nesting rules for good measure. Recent versions of systemd are triggering this, and it can also be demonstrated with the following trivial test program: --------------------8<-------------------- int main(void) { int e1, e2; struct epoll_event evt = { .events = EPOLLIN }; e1 = epoll_create1(0); e2 = epoll_create1(0); epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt); return 0; } --------------------8<-------------------- Reported-by: Paul Bolle Tested-by: Paul Bolle Signed-off-by: Nelson Elhage Acked-by: Jason Baron Cc: Dave Jones Cc: Davide Libenzi Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9026fc91fe3b..828e750af23a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -70,6 +70,15 @@ * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) * @ep: Pointer to the epoll private data structure. * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. * * Returns: The same integer error code returned by the @sproc callback.
*/ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv) + void *priv, + int depth) { int error, pwake = 0; unsigned long flags; @@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL); + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) @@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file) ep = epi->ep; list_del_init(&epi->fllink); - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } @@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct rb_node *rbp; struct epitem *epi; - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, call_nests + 1); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { @@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" -- cgit v1.2.3-58-ga151 From f6d90b4f9ce018bff429d6e01ee672de712b8641 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Nov 2011 07:06:17 -0700 Subject: sysfs: Make sysfs_rename safe with sysfs_dirents in rbtrees. In sysfs_rename we need to remove the optimization of not calling sysfs_unlink_sibling and sysfs_link_sibling if the parent directory of the renamed entry is not changing. This optimization is no longer valid now that sysfs dirents are stored in an rbtree sorted by name. Move the assignment of s_ns before the call of sysfs_link_sibling. With no sysfs_dirent fields changing after the call of sysfs_link_sibling, this allows sysfs_link_sibling to take any of the directory entries into account when it builds the rbtrees, and s_ns looks like a prime candidate to be used in the rbtree in the future. Signed-off-by: Eric W. Biederman Cc: Jiri Slaby Cc: Greg KH Cc: David Miller Cc: Mikulas Patocka Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/dir.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 48ffbdf0d017..7fdf6a7b7436 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -865,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_name = new_name; } - /* Remove from old parent's list and insert into new parent's list.
*/ - if (sd->s_parent != new_parent_sd) { - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - } + /* Move to the appropriate place in the appropriate directories rbtree. */ + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); sd->s_ns = new_ns; + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); error = 0; out: -- cgit v1.2.3-58-ga151 From fc0d14fe2d6403eb21202fd0c1cf67cd2c85ca67 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 27 Oct 2011 20:43:01 +0200 Subject: nfsd4: typo logical vs bitwise negate in nfsd4_decode_share_access Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 66d095d7955e..b6fa792d6b85 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -655,7 +655,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) default: return nfserr_bad_xdr; } - w &= !NFS4_SHARE_ACCESS_MASK; + w &= ~NFS4_SHARE_ACCESS_MASK; if (!w) return nfs_ok; if (!argp->minorversion) -- cgit v1.2.3-58-ga151 From 32de67569059d22b02dd9323a40220d953642b7e Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 1 Nov 2011 18:56:41 -0400 Subject: ext4: fix a syntax error in ext4_ext_insert_extent when debugging enabled This patch fixes a syntax error caused by a missing comma. Besides this, the logical block number is an unsigned 32-bit value, so printk should use %u instead of %d. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2798945a1920..8cacaf645e36 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1682,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* try to insert block into found extent and return */ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), @@ -1720,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %d\n", next); + ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); if (IS_ERR(npath)) @@ -1758,7 +1758,7 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), @@ -1768,8 +1768,8 @@ if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ - ext_debug("insert %d:%llu:[%d]%d %s before: " - "nearest 0x%p\n" + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), @@ -1779,8 +1779,8 @@ } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); - ext_debug("insert %d:%llu:[%d]%d %s
after: " - "nearest 0x%p\n" + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), @@ -1789,7 +1789,7 @@ has_space: } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { - ext_debug("insert %d:%llu:[%d]%d: " + ext_debug("insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), -- cgit v1.2.3-58-ga151 From bf52c6f7af55c48ab0fd5f990460b884b428d906 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 1 Nov 2011 18:59:26 -0400 Subject: ext4: let ext4_ext_rm_leaf work with EXT_DEBUG defined The variable 'block' is removed by commit 750c9c47, so use the replacement ex_ee_block instead. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8cacaf645e36..61fa9e1614af 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2424,7 +2424,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - ext_debug("new extent: %u:%u:%llu\n", block, num, + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); -- cgit v1.2.3-58-ga151 From 8762202dd0d6e46854f786bdb6fb3780a1625efe Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 1 Nov 2011 19:04:59 -0400 Subject: jbd/jbd2: validate sb->s_first in journal_get_superblock() I hit a J_ASSERT(blocknr != 0) failure in cleanup_journal_tail() when mounting a fsfuzzed ext3 image. It turns out that the corrupted ext3 image has s_first = 0 in journal superblock, and the 0 is passed to journal->j_head in journal_reset(), then to blocknr in cleanup_journal_tail(), in the end the J_ASSERT failed. So validate s_first after reading journal superblock from disk in journal_get_superblock() to ensure s_first is valid. 
The following script could reproduce it:

fstype=ext3
blocksize=1024
img=$fstype.img
offset=0
found=0
magic="c0 3b 39 98"

dd if=/dev/zero of=$img bs=1M count=8
mkfs -t $fstype -b $blocksize -F $img

filesize=`stat -c %s $img`
while [ $offset -lt $filesize ]
do
        if od -j $offset -N 4 -t x1 $img | grep -i "$magic";then
                echo "Found journal: $offset"
                found=1
                break
        fi
        offset=`echo "$offset+$blocksize" | bc`
done
if [ $found -ne 1 ];then
        echo "Magic \"$magic\" not found"
        exit 1
fi

dd if=/dev/zero of=$img seek=$(($offset+23)) conv=notrunc bs=1 count=1

mkdir -p ./mnt
mount -o loop $img ./mnt

Cc: Jan Kara Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/jbd/journal.c | 8 ++++++++ fs/jbd2/journal.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9fe061fb8779..fea8dd661d2b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f24df13adc4e..d6e93d00833e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1251,6 +1251,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: -- cgit v1.2.3-58-ga151 From f2a44523b20f323e4aef7c16261d34d6f0a4bf06 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 1 Nov 2011 19:09:18 -0400 Subject: jbd2: Unify log messages in jbd2 code Some jbd2 code prints out kernel messages with a "JBD2: " prefix, while other jbd2 code prints with a "JBD: " prefix. Unify the prefix to "JBD2: ". Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 26 +++++++++++++------------- fs/jbd2/journal.c | 36 ++++++++++++++++++------------------ fs/jbd2/recovery.c | 28 ++++++++++++++-------------- fs/jbd2/transaction.c | 4 ++-- 4 files changed, 47 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eef6979821a4..68d704db787f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd2_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD: starting commit of transaction %d\n", + jbd_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); @@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) __jbd2_journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); - jbd_debug (3, "JBD: commit phase 1\n"); + jbd_debug(3, "JBD2: commit phase 1\n"); /* * Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug (3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear @@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) WRITE_SYNC); blk_finish_plug(&plug); - jbd_debug(3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Way to go: we have now written out all of the data for a @@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT (bufs == 0); - jbd_debug(4, "JBD: get descriptor\n"); + jbd_debug(4, "JBD2: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) { @@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) } bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", + jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); @@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16) { - jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to @@ -707,7 +707,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 3\n"); + jbd_debug(3, "JBD2: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -771,7 +771,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -801,7 +801,7 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD2: commit phase 5\n"); write_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); commit_transaction->t_state = T_COMMIT_JFLUSH; @@ -830,7 +830,7 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD2: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); @@ -964,7 +964,7 @@ restart_loop: /* Done with this transaction! 
*/ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD2: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); @@ -1039,7 +1039,7 @@ restart_loop: journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD: commit %d complete, head %d\n", + jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) kfree(commit_transaction); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d6e93d00833e..0fa0123151d3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ journal->j_commit_request = target; - jbd_debug(1, "JBD: requesting commit %d/%d\n", + jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); @@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) /* This should never happen, but if it does, preserve the evidence before kjournald goes into a loop and increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", journal->j_commit_request, journal->j_commit_sequence, target, journal->j_running_transaction ? @@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } #endif while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); @@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", first, last); journal_fail_superblock(journal); return -EINVAL; @@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) */ if (sb->s_start == 0 && journal->j_tail_sequence == journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) } read_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal) ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - printk (KERN_ERR - "JBD: IO error reading journal superblock\n"); + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); goto out; } } @@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal) if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + printk(KERN_WARNING "JBD2: no valid journal superblock 
found\n"); goto out; } @@ -1240,14 +1240,14 @@ static int journal_get_superblock(journal_t *journal) journal->j_format_version = 2; break; default: - printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); goto out; } if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) journal->j_maxlen = be32_to_cpu(sb->s_maxlen); else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { - printk (KERN_WARNING "JBD: journal file too short\n"); + printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } @@ -1318,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal) ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk (KERN_WARNING - "JBD: Unrecognised features on journal\n"); + printk(KERN_WARNING + "JBD2: Unrecognised features on journal\n"); return -EINVAL; } } @@ -1354,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal) return 0; recovery_error: - printk (KERN_WARNING "JBD: recovery failed\n"); + printk(KERN_WARNING "JBD2: recovery failed\n"); return -EIO; } @@ -1585,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal, struct buffer_head *bh; printk(KERN_WARNING - "JBD: Converting superblock from version 1 to 2.\n"); + "JBD2: Converting superblock from version 1 to 2.\n"); /* Pre-initialise new fields to zero */ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); @@ -1702,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (!journal->j_tail) goto no_recovery; - printk (KERN_WARNING "JBD: %s recovery information on journal\n", + printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); @@ -2028,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void) retval = 0; if (!jbd2_journal_head_cache) { retval = -ENOMEM; - printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); } return retval; } @@ -2391,7 +2391,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_debugfs_entry(); jbd2_remove_jbd_stats_proc_entry(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 1cad869494f0..da6d7baf1390 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start) err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", next); goto failed; } @@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; if (offset >= journal->j_maxlen) { - printk(KERN_ERR "JBD: corrupted journal superblock\n"); + printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EIO; } err = jbd2_journal_bmap(journal, offset, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", offset); return err; } @@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, } if (!buffer_uptodate(bh)) { - printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", offset); brelse(bh); return -EIO; @@ -251,10 
+251,10 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(1, "JBD: recovery, exit status %d, " + jbd_debug(1, "JBD2: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal) err = do_one_pass(journal, &info, PASS_SCAN); if (err) { - printk(KERN_ERR "JBD: error %d scanning journal\n", err); + printk(KERN_ERR "JBD2: error %d scanning journal\n", err); ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); jbd_debug(1, - "JBD: ignoring %d transaction%s from the journal.\n", + "JBD2: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); #endif journal->j_transaction_sequence = ++info.end_transaction; @@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, wrap(journal, *next_log_block); err = jread(&obh, journal, io_block); if (err) { - printk(KERN_ERR "JBD: IO error %d recovering block " + printk(KERN_ERR "JBD2: IO error %d recovering block " "%lu in log\n", err, io_block); return 1; } else { @@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal, * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal, /* Recover what we can, but * report failure at the end. */ success = err; - printk (KERN_ERR - "JBD: IO error %d recovering " + printk(KERN_ERR + "JBD2: IO error %d recovering " "block %ld in log\n", err, io_block); } else { @@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal, journal->j_blocksize); if (nbh == NULL) { printk(KERN_ERR - "JBD: Out of memory " + "JBD2: Out of memory " "during recovery.\n"); err = -ENOMEM; brelse(bh); @@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal, /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ if (info->end_transaction != next_commit_ID) { - printk (KERN_ERR "JBD: recovery pass %d ended at " + printk(KERN_ERR "JBD2: recovery pass %d ended at " "transaction %u, expected %u\n", pass, next_commit_ID, info->end_transaction); if (!success) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 1e5c5ead5b0d..a0e41a4c080e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -125,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); return -ENOSPC; @@ -564,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh) char b[BDEVNAME_SIZE]; printk(KERN_WARNING - "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). 
" "There's a risk of filesystem corruption in case of system " "crash.\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); -- cgit v1.2.3-58-ga151 From 32096ea1aac14e6f29d4744924092eca52b937b0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 1 Nov 2011 16:12:33 +0300 Subject: vfs: fix dentry leak in simple_fill_super() put dentry if inode allocation failed, d_genocide() cannot release it Signed-off-by: Konstantin Khlebnikov Signed-off-by: Christoph Hellwig --- fs/libfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index c18e9a1235b6..a2c0029cd95f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic, if (!dentry) goto out; inode = new_inode(s); - if (!inode) + if (!inode) { + dput(dentry); goto out; + } inode->i_mode = S_IFREG | files->mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; -- cgit v1.2.3-58-ga151 From 1fa1e7f615f4d3ae436fa319af6e4eebdd4026a8 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 2 Nov 2011 09:44:39 +0100 Subject: readlinkat: ensure we return ENOENT for the empty pathname for normal lookups Since the commit below which added O_PATH support to the *at() calls, the error return for readlink/readlinkat for the empty pathname has switched from ENOENT to EINVAL: commit 65cfc6722361570bfe255698d9cd4dccaf47570d Author: Al Viro Date: Sun Mar 13 15:56:26 2011 -0400 readlinkat(), fchownat() and fstatat() with empty relative pathnames This is both unexpected for userspace and makes readlink/readlinkat inconsistant with all other interfaces; and inconsistant with our stated return for these pathnames. As the readlinkat call does not have a flags parameter we cannot use the AT_EMPTY_PATH approach used in the other calls. Therefore expose whether the original path is infact entry via a new user_path_at_empty() path lookup function. Use this to determine whether to default to EINVAL or ENOENT for failures. 
Addresses http://bugs.launchpad.net/bugs/817187 [akpm@linux-foundation.org: remove unused getname_flags()] Signed-off-by: Andy Whitcroft Cc: Christoph Hellwig Cc: Al Viro Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Christoph Hellwig --- fs/namei.c | 18 +++++++++++++----- fs/stat.c | 5 +++-- include/linux/namei.h | 1 + 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7657be4352bf..ac6d214da827 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -137,7 +137,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -static char *getname_flags(const char __user * filename, int flags) +static char *getname_flags(const char __user *filename, int flags, int *empty) { char *tmp, *result; @@ -148,6 +148,8 @@ static char *getname_flags(const char __user * filename, int flags) result = tmp; if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { __putname(tmp); result = ERR_PTR(retval); @@ -160,7 +162,7 @@ static char *getname_flags(const char __user * filename, int flags) char *getname(const char __user * filename) { - return getname_flags(filename, 0); + return getname_flags(filename, 0, 0); } #ifdef CONFIG_AUDITSYSCALL @@ -1798,11 +1800,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, NULL); } -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) { struct nameidata nd; - char *tmp = getname_flags(name, flags); + char *tmp = getname_flags(name, flags, empty); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -1816,6 +1818,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, return err; } +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, 0); +} + static int user_path_parent(int dfd, const char __user *path, struct nameidata *nd, char **name) { diff --git a/fs/stat.c b/fs/stat.c index 78a3aa83c7ea..8806b8997d2e 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -294,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, { struct path path; int error; + int empty = 0; if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); + error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); if (!error) { struct inode *inode = path.dentry->d_inode; - error = -EINVAL; + error = empty ? 
-ENOENT : -EINVAL; if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { diff --git a/include/linux/namei.h b/include/linux/namei.h index 409328d1cbbb..ffc02135c483 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,6 +67,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_EMPTY 0x4000 extern int user_path_at(int, const char __user *, unsigned, struct path *); +extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) #define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path) -- cgit v1.2.3-58-ga151 From 2833eb2b465a274d1a2529fed76c6d2904f8022b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:20 +0200 Subject: vfs: ignore error on forced remount On emergency remount we want to force MS_RDONLY on the super block even if ->remount_fs() failed for some reason. Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 32a81f3467e0..afd0f1ad45e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) - return retval; + if (retval) { + if (!force) + return retval; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); -- cgit v1.2.3-58-ga151 From c818b682c955aa4d13212512deb529cb8bd1348b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:25 +0200 Subject: jfs: remove unnecessary nlink setting alloc_inode() initializes i_nlink to 1. Remove unnecessary re-initialization. Signed-off-by: Miklos Szeredi Acked-by: Dave Kleikamp Signed-off-by: Christoph Hellwig --- fs/jfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 06c8a67cbe76..a44eff076c17 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_nlink = 1; inode->i_size = sb->s_bdev->bd_inode->i_size; inode->i_mapping->a_ops = &jfs_metapage_aops; insert_inode_hash(inode); -- cgit v1.2.3-58-ga151 From a7732b05f775a5575baac34c03bb0e8d16950edf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:26 +0200 Subject: ocfs2: remove unnecessary nlink setting alloc_inode() initializes i_nlink to 1. Remove unnecessary re-initialization. Signed-off-by: Miklos Szeredi CC: Joel Becker CC: Mark Fasheh Signed-off-by: Christoph Hellwig --- fs/ocfs2/namei.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 53aa41ed7bf3..0181f63dd463 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -200,8 +200,6 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) * callers. 
*/ if (S_ISDIR(mode)) inode->i_nlink = 2; - else - inode->i_nlink = 1; inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; -- cgit v1.2.3-58-ga151 From dd2a981f46a0903a8770a784f213d4d40bbb6f19 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:27 +0200 Subject: logfs: remove unnecessary nlink setting alloc_inode() initializes i_nlink to 1. Remove unnecessary re-initialization. Signed-off-by: Miklos Szeredi CC: Joern Engel CC: Prasad Joshi Signed-off-by: Christoph Hellwig --- fs/logfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index edfea7a3a747..9abb048b96c0 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -199,7 +199,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode) inode->i_blocks = 0; inode->i_ctime = CURRENT_TIME; inode->i_mtime = CURRENT_TIME; - inode->i_nlink = 1; li->li_refcount = 1; INIT_LIST_HEAD(&li->li_freeing_list); -- cgit v1.2.3-58-ga151 From 6d6b77f163c7eabedbba00ed2abb7d4a570bff76 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:28 +0200 Subject: filesystems: add missing nlink wrappers Replace direct i_nlink updates with the respective updater function (inc_nlink, drop_nlink, clear_nlink, inode_dec_link_count). Signed-off-by: Miklos Szeredi --- arch/s390/hypfs/inode.c | 2 +- drivers/mtd/mtdchar.c | 2 +- fs/affs/amigaffs.c | 2 +- fs/affs/namei.c | 6 +++--- fs/binfmt_misc.c | 2 +- fs/cifs/link.c | 2 +- fs/coda/dir.c | 2 +- fs/devpts/inode.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext3/ialloc.c | 2 +- fs/ext3/namei.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/migrate.c | 2 +- fs/ext4/namei.c | 2 +- fs/hfs/dir.c | 4 ++-- fs/hfsplus/dir.c | 4 ++-- fs/hpfs/inode.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/jfs/jfs_inode.c | 2 +- fs/jfs/namei.c | 10 +++++----- fs/logfs/dir.c | 8 ++++---- fs/logfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/ocfs2/namei.c | 8 ++++---- fs/proc/proc_sysctl.c | 2 +- fs/reiserfs/inode.c | 4 ++-- fs/reiserfs/namei.c | 12 ++++++------ fs/ubifs/xattr.c | 2 +- fs/udf/namei.c | 11 ++++------- fs/ufs/ialloc.c | 2 +- mm/shmem.c | 2 +- 32 files changed, 55 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 7ae0d0d985a8..baa82f8dd076 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -359,7 +359,7 @@ static struct dentry *hypfs_create_file(struct super_block *sb, } else if (mode & S_IFDIR) { inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - parent->d_inode->i_nlink++; + inc_nlink(parent->d_inode); } else BUG(); inode->i_private = data; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index f1af2228a1b1..61086ea3cc6b 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1144,7 +1144,7 @@ static void mtdchar_notify_remove(struct mtd_info *mtd) if (mtd_ino) { /* Destroy the inode if it exists */ - mtd_ino->i_nlink = 0; + clear_nlink(mtd_ino); iput(mtd_ino); } } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 3a4557e8325c..8f12723bbc97 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry) if (inode->i_nlink > 1) retval = affs_remove_link(dentry); else - inode->i_nlink = 0; + clear_nlink(inode); affs_unlock_link(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index e3e9efc1fdd8..780a11dc6318 100644 --- 
a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; error = affs_add_entry(dir, inode, dentry, ST_FILE); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); return error; } @@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) error = affs_add_entry(dir, inode, dentry, ST_USERDIR); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; @@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return 0; err: - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ba1a1ae4a18a..1e9edbdeda7e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -521,7 +521,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - dentry->d_inode->i_nlink--; + drop_nlink(dentry->d_inode); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 8693b5d0e180..6b0e06434391 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -433,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, if (old_file->d_inode) { cifsInode = CIFS_I(old_file->d_inode); if (rc == 0) { - old_file->d_inode->i_nlink++; + inc_nlink(old_file->d_inode); /* BB should we make this contingent on superblock flag NOATIME? */ /* old_file->d_inode->i_ctime = CURRENT_TIME;*/ /* parent dir timestamps will update from srv diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0239433f50cb..28e7e135cfab 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) if (!error) { /* VFS may delete the child */ if (de->d_inode) - de->d_inode->i_nlink = 0; + clear_nlink(de->d_inode); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 2f27e578d466..c196e544c64e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty) dentry = d_find_alias(inode); - inode->i_nlink--; + drop_nlink(inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ee9ed31948e1..c4e81dfb74ba 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -601,7 +601,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf938cc..835d4ea61b0e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -621,7 +621,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0629e09f6511..8a60e3327659 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1833,7 +1833,7 @@ retry: if (err) { out_clear_inode: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9c63f273b550..ecc55bd2f997 100644 --- 
a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1084,7 +1084,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(inode_bitmap_bh); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b57b98fb44d1..667cc880bc0c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -507,7 +507,7 @@ int ext4_ext_migrate(struct inode *inode) * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ - tmp_inode->i_nlink = 0; + clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1c924faeb6c8..4623c082f3b2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2539,7 +2539,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - new_inode->i_nlink = 0; + clear_nlink(new_inode); } else { ext4_inc_count(handle, new_dir); ext4_update_dx_flag(new_dir); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b4d70b13be92..bce4eef91a06 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; @@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 25b2443a004c..4536cd3f15ae 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, goto out; out_err: - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); out: @@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); goto out; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 338cd8368451..dfe800c8ae38 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 0;*/ + clear_nlink(i);*/ make_bad_inode(i); return; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ec889538e5a6..0be5a78598d0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -970,7 +970,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; + clear_nlink(inode); error = -ENFILE; file = alloc_file(&path, FMODE_WRITE | FMODE_READ, diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 2686531e235a..c1a3e603279c 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -157,7 +157,7 @@ fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); fail_put: iput(inode); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e17545e15664..94b0a624b85f 100644 --- a/fs/jfs/namei.c +++ 
b/fs/jfs/namei.c @@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry, rc = txCommit(tid, 2, &iplist[0], 0); if (rc) { - ip->i_nlink--; /* never instantiated */ + drop_nlink(ip); /* never instantiated */ iput(ip); } else d_instantiate(dentry, ip); @@ -1048,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -1433,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, mutex_unlock(&JFS_IP(dir)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b3ff3d894165..b7d7f67cee5a 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode) { int ret; - inode->i_nlink--; + drop_nlink(inode); ret = write_inode(inode); LOGFS_BUG_ON(ret, inode->i_sb); return ret; @@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, ta = kzalloc(sizeof(*ta), GFP_KERNEL); if (!ta) { - inode->i_nlink--; + drop_nlink(inode); iput(inode); return -ENOMEM; } @@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, abort_transaction(inode, ta); li->li_flags |= LOGFS_IF_STILLBORN; /* FIXME: truncate symlink */ - inode->i_nlink--; + drop_nlink(inode); iput(inode); goto out; } @@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); - inode->i_nlink++; + inc_nlink(inode); mark_inode_dirty_sync(inode); return __logfs_create(dir, dentry, inode, NULL, 0); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 9abb048b96c0..7e441ad5f792 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) /* inode->i_nlink == 0 can be true when called from * block validator */ /* set i_nlink to 0 to prevent caching */ - inode->i_nlink = 0; + clear_nlink(inode); logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; iget_failed(inode); if (!err) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4dc6d078f108..1fc715acc2da 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -320,7 +320,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); inode->i_version = 0; inode->i_size = 0; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_uid = -2; inode->i_gid = -2; inode->i_blocks = 0; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 666628b395f1..08ac272b7c28 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) failed_acl: failed_bmap: - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); /* raw_inode will be deleted through generic_delete_inode() */ goto failed; diff --git 
a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 0181f63dd463..ea0ecbdda58c 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1377,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir, } if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; @@ -1385,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir, if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); - old_dir->i_nlink--; + drop_nlink(old_dir); if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); } else { inc_nlink(new_dir); mark_inode_dirty(new_dir); @@ -2280,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - inode->i_nlink = 0; + clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, 0, &new_di_bh, parent_di_bh, handle, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbef226f..b44113279e30 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -39,7 +39,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b0d4b78b4fb..c425441c6942 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode, /* a stale NFS handle can trigger this without it being an error */ pathrelse(&path_to_sd); reiserfs_make_bad_inode(inode); - inode->i_nlink = 0; + clear_nlink(inode); return; } @@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, make_bad_inode(inode); out_inserted_sd: - inode->i_nlink = 0; + clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ iput(inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ef392324bbf1..6ce332821633 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink = 0; + clear_nlink(inode); DEC_DIR_INODE_NLINK(dir); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); @@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, parent_dir->i_sb, jbegin_count); if (err) @@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, retval = journal_begin(&th, 
dir->i_sb, jbegin_count); if (retval) { - inode->i_nlink--; + drop_nlink(inode); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); err = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 16f19f55e63f..993adc8f1c87 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -558,7 +558,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) } ubifs_assert(inode->i_nlink == 1); - inode->i_nlink = 0; + clear_nlink(inode); err = remove_xattr(c, host, inode, &nm); if (err) inode->i_nlink = 1; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce848ef96..e8d61b114c1b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -665,8 +663,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_fop = &udf_dir_operations; fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); goto out; } @@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); goto out; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 2eabf04af3de..78a4c70d46b5 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -341,7 +341,7 @@ cg_found: fail_remove_inode: unlock_super(sb); - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); diff --git a/mm/shmem.c b/mm/shmem.c index fa4fa6ce13bc..45b9acb575f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2503,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; /* It is unlinked */ + clear_nlink(inode); /* It is unlinked */ #ifndef CONFIG_MMU error = ramfs_nommu_expand_for_mapping(inode, size); if (error) -- cgit v1.2.3-58-ga151 From bfe8684869601dacfcb2cd69ef8cfd9045f62170 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:29 +0200 Subject: filesystems: add set_nlink() Replace remaining direct i_nlink updates with a new set_nlink() updater function. 
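For reference, a minimal sketch of the helper this patch introduces in include/linux/fs.h (that hunk falls outside this excerpt, so the kerneldoc wording below is illustrative rather than verbatim):

	/**
	 * set_nlink - directly set an inode's link count
	 * @inode: inode being updated
	 * @nlink: the new link count
	 *
	 * Low-level filesystem helper that replaces open-coded stores
	 * to inode->i_nlink, giving the VFS a single choke point for
	 * link count updates.
	 */
	static inline void set_nlink(struct inode *inode, unsigned int nlink)
	{
		inode->i_nlink = nlink;
	}

With one entry point in place, the conversions below are mechanical ("inode->i_nlink = n" becomes "set_nlink(inode, n)"), and any future sanity checking or accounting of link counts can live inside the helper instead of in every filesystem.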
Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- arch/s390/hypfs/inode.c | 2 +- drivers/staging/pohmelfs/inode.c | 2 +- fs/9p/vfs_inode.c | 4 ++-- fs/9p/vfs_inode_dotl.c | 4 ++-- fs/adfs/inode.c | 2 +- fs/affs/amigaffs.c | 2 +- fs/affs/inode.c | 8 ++++---- fs/afs/fsclient.c | 2 +- fs/afs/inode.c | 4 ++-- fs/autofs4/inode.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/bfs/dir.c | 2 +- fs/bfs/inode.c | 2 +- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/tree-log.c | 2 +- fs/ceph/caps.c | 2 +- fs/ceph/inode.c | 2 +- fs/cifs/inode.c | 6 +++--- fs/coda/coda_linux.c | 2 +- fs/devpts/inode.c | 2 +- fs/ecryptfs/inode.c | 12 ++++++------ fs/efs/inode.c | 2 +- fs/exofs/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext3/namei.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/namei.c | 6 +++--- fs/fat/inode.c | 4 ++-- fs/fat/namei_msdos.c | 2 +- fs/fat/namei_vfat.c | 2 +- fs/freevxfs/vxfs_inode.c | 2 +- fs/fuse/control.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/glops.c | 2 +- fs/hfs/inode.c | 4 ++-- fs/hfsplus/inode.c | 10 +++++----- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/dir.c | 2 +- fs/hpfs/inode.c | 8 ++++---- fs/hpfs/namei.c | 8 ++++---- fs/hppfs/hppfs.c | 2 +- fs/isofs/inode.c | 4 ++-- fs/isofs/rock.c | 4 ++-- fs/jffs2/dir.c | 6 +++--- fs/jffs2/fs.c | 6 +++--- fs/jfs/jfs_imap.c | 6 +++--- fs/jfs/namei.c | 2 +- fs/libfs.c | 2 +- fs/logfs/readwrite.c | 2 +- fs/minix/inode.c | 4 ++-- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 4 ++-- fs/nilfs2/inode.c | 2 +- fs/nilfs2/namei.c | 2 +- fs/ntfs/inode.c | 8 ++++---- fs/ocfs2/dir.c | 4 ++-- fs/ocfs2/dlmglue.c | 2 +- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/namei.c | 8 ++++---- fs/openpromfs/inode.c | 4 ++-- fs/proc/base.c | 12 ++++++------ fs/proc/generic.c | 2 +- fs/proc/inode.c | 2 +- fs/qnx4/inode.c | 2 +- fs/reiserfs/inode.c | 6 +++--- fs/reiserfs/namei.c | 4 ++-- fs/romfs/super.c | 2 +- fs/squashfs/inode.c | 18 +++++++++--------- fs/stack.c | 2 +- fs/sysfs/inode.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/ubifs/xattr.c | 2 +- fs/udf/inode.c | 8 +++++--- fs/udf/namei.c | 4 ++-- fs/ufs/inode.c | 4 ++-- fs/xfs/xfs_iops.c | 2 +- include/linux/fs.h | 13 +++++++++++++ 81 files changed, 163 insertions(+), 148 deletions(-) (limited to 'fs') diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index baa82f8dd076..481f4f76f664 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -108,7 +108,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, int mode) ret->i_gid = hypfs_info->gid; ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; if (mode & S_IFDIR) - ret->i_nlink = 2; + set_nlink(ret, 2); } return ret; } diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index f3c6060c96b8..7a1955583b7d 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -1197,7 +1197,7 @@ const struct inode_operations pohmelfs_file_inode_operations = { void pohmelfs_fill_inode(struct inode *inode, struct netfs_inode_info *info) { inode->i_mode = info->mode; - inode->i_nlink = info->nlink; + set_nlink(inode, info->nlink); inode->i_uid = info->uid; inode->i_gid = info->gid; inode->i_blocks = info->blocks; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b5a1076aaa6c..879ed8851737 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1138,7 +1138,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct v9fs_session_info *v9ses = sb->s_fs_info; struct 
v9fs_inode *v9inode = V9FS_I(inode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; @@ -1164,7 +1164,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* HARDLINKCOUNT %u */ sscanf(ext, "%13s %u", tag_name, &i_nlink); if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) - inode->i_nlink = i_nlink; + set_nlink(inode, i_nlink); } } mode = stat->mode & S_IALLUGO; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index aded79fcd5cf..0b5745e21946 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_ctime.tv_nsec = stat->st_ctime_nsec; inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { inode->i_mode = stat->st_mode; if ((S_ISBLK(inode->i_mode)) || diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index d5250c5aae21..1dab6a174d6a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_gid = ADFS_SB(sb)->s_gid; inode->i_ino = obj->file_id; inode->i_size = obj->size; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 8f12723bbc97..de37ec842340 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry) break; default: if (!AFFS_TAIL(sb, bh)->link_chain) - inode->i_nlink = 1; + set_nlink(inode, 1); } affs_free_block(sb, link_ino); goto done; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 5d828903ac69..88a4b0b50058 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) prot = be32_to_cpu(tail->protect); inode->i_size = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mode = 0; AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; @@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) sbi->s_hashsize + 1; } if (tail->link_chain) - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? 
&affs_aops_ofs : &affs_aops; inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; @@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_ino = block; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; @@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); - inode->i_nlink = 2; + set_nlink(inode, 2); ihold(inode); } affs_fix_checksum(sb, bh); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 346e3289abd7..2f213d109c21 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; vnode->vfs_inode.i_generation = vnode->fid.unique; - vnode->vfs_inode.i_nlink = status->nlink; + set_nlink(&vnode->vfs_inode, status->nlink); mode = vnode->vfs_inode.i_mode; mode &= ~S_IALLUGO; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0fdab6e03d87..d890ae3b2ce6 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) fscache_attr_changed(vnode->cache); #endif - inode->i_nlink = vnode->status.nlink; + set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; inode->i_gid = 0; inode->i_size = vnode->status.size; @@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_uid = 0; inode->i_gid = 0; inode->i_ctime.tv_sec = get_seconds(); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 180fa2425e49..8179f1ab8175 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 720d885e8dca..8342ca67abcd 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_gid = befs_sb->mount_opts.use_gid ? befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); - inode->i_nlink = 1; + set_nlink(inode, 1); /* * BEFS's time is 64 bits, but current VFS is 32 bits... 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index b14cebfd9047..9cc074019479 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index a8e37f81d097..697af5bf70b3 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); inode->i_uid = le32_to_cpu(di->i_uid); inode->i_gid = le32_to_cpu(di->i_gid); - inode->i_nlink = le32_to_cpu(di->i_nlink); + set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672f4c18..ae4d9cd10961 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1641,7 +1641,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_gid = btrfs_stack_inode_gid(inode_item); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); - inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..07ea91879a91 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1705,7 +1705,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. 
* the real end of the address space is determined by all of diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..75686a61bd45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2534,7 +2534,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); inode->i_uid = btrfs_inode_uid(leaf, inode_item); inode->i_gid = btrfs_inode_gid(leaf, inode_item); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); @@ -6728,7 +6728,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 786639fca067..0618aa39740b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1030,7 +1030,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } btrfs_release_path(path); if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b8731bf3ef1f..15b21e35078a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2363,7 +2363,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + set_nlink(inode, le32_to_cpu(grant->nlink)); if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5dde7d51dc11..1616a0d37cbd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -618,7 +618,7 @@ static int fill_inode(struct inode *inode, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(info->nlink); + set_nlink(inode, le32_to_cpu(info->nlink)); /* be careful with mtime, atime, size */ ceph_decode_timespec(&atime, &info->atime); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2c50bd2f65d1..e851d5b8931e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; - inode->i_nlink = fattr->cf_nlink; + set_nlink(inode, fattr->cf_nlink); inode->i_uid = fattr->cf_uid; inode->i_gid = fattr->cf_gid; @@ -905,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb) if (rc && tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &cifs_ipc_inode_ops; inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; @@ -1367,7 +1367,7 @@ mkdir_get_info: /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) - direntry->d_inode->i_nlink = 2; + set_nlink(direntry->d_inode, 2); mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2bdbcc11b373..854ace712685 100644 --- a/fs/coda/coda_linux.c +++ 
b/fs/coda/coda_linux.c @@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_gid != -1) inode->i_gid = (gid_t) attr->va_gid; if (attr->va_nlink != -1) - inode->i_nlink = attr->va_nlink; + set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) inode->i_size = attr->va_size; if (attr->va_size != -1) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c196e544c64e..d5d5297efe97 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); s->s_root = d_alloc_root(inode); if (s->s_root) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 11f8582d7218..a36d327f1521 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -474,8 +474,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - old_dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + set_nlink(old_dentry->d_inode, + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); i_size_write(new_dentry->d_inode, file_size_save); out_lock: unlock_dir(lower_dir_dentry); @@ -499,8 +499,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) goto out_unlock; } fsstack_copy_attr_times(dir, lower_dir_inode); - dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + set_nlink(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); dentry->d_inode->i_ctime = dir->i_ctime; d_drop(dentry); out_unlock: @@ -565,7 +565,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); out: unlock_dir(lower_dir_dentry); if (!dentry->d_inode) @@ -588,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (!rc && dentry->d_inode) clear_nlink(dentry->d_inode); fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 9c13412e6c99..bc84f365d75c 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) efs_inode = (struct efs_dinode *) (bh->b_data + offset); inode->i_mode = be16_to_cpu(efs_inode->di_mode); - inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); inode->i_size = be32_to_cpu(efs_inode->di_size); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3e5f3a6be90a..f6dbf7768ce6 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1165,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(fcb.i_mode); inode->i_uid = le32_to_cpu(fcb.i_uid); inode->i_gid = le32_to_cpu(fcb.i_gid); - inode->i_nlink = 
le16_to_cpu(fcb.i_links_count); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a8a58f63f07c..91a6945af6d8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 12661e1deedd..85fe655fe3e0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2899,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8a60e3327659..642dc6d66dfd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1821,7 +1821,7 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, dir_block); if (err) @@ -2170,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0defe0bfe019..be6668bbc1b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3418,7 +3418,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4623c082f3b2..5f7fb46293be 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1694,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) if (is_dx(inode) && inode->i_nlink > 1) { /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - inode->i_nlink = 1; + set_nlink(inode, 1); EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_DIR_NLINK); } @@ -1861,7 +1861,7 @@ retry: de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, dir, dir_block); if 
(err) @@ -2214,7 +2214,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1726d7303047..808cac7edcfb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return error; MSDOS_I(inode)->mmu_private = inode->i_size; - inode->i_nlink = fat_subdirs(inode); + set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, @@ -1233,7 +1233,7 @@ static int fat_read_root(struct inode *inode) fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; - inode->i_nlink = fat_subdirs(inode)+2; + set_nlink(inode, fat_subdirs(inode)+2); return 0; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 66e83b845455..216b419f30e2 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) /* the directory was completed, just return a error */ goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index bb3f29c3557b..a87a65663c25 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -900,7 +900,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; } inode->i_version++; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1a4311437a8b..7b2af5abe2fa 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) ip->i_uid = (uid_t)vip->vii_uid; ip->i_gid = (gid_t)vip->vii_gid; - ip->i_nlink = vip->vii_nlink; + set_nlink(ip, vip->vii_nlink); ip->i_size = vip->vii_size; ip->i_atime.tv_sec = vip->vii_atime; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 85542a7daf40..42593c587d48 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (iop) inode->i_op = iop; inode->i_fop = fop; - inode->i_nlink = nlink; + set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); return dentry; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index add96f6ffda5..3e6d72756479 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -151,7 +151,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = attr->ino; inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); - inode->i_nlink = attr->nlink; + set_nlink(inode, attr->nlink); inode->i_uid = attr->uid; inode->i_gid = attr->gid; inode->i_blocks = attr->blocks; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78418b4fa857..1656df7aacd2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -299,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink) if (nlink == 0) clear_nlink(inode); else - inode->i_nlink = nlink; + set_nlink(inode, nlink); } } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 96a1b625fc74..a1a9fdcd2a00 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; @@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data) /* Initialize the inode */ inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; - inode->i_nlink = 1; + set_nlink(inode, 1); if (idata->key) HFS_I(inode)->cat_key = *idata->key; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4cc1e3a36ec7..40e1413be4cf 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; hip = HFSPLUS_I(inode); @@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); @@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 
&file->rsrc_fork : &file->data_fork); hfsplus_get_perms(inode, &file->permissions, 0); - inode->i_nlink = 1; + set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) - inode->i_nlink = - be32_to_cpu(file->permissions.dev); + set_nlink(inode, + be32_to_cpu(file->permissions.dev)); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0d22afdd4611..2f72da5ae686 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -541,7 +541,7 @@ static int read_name(struct inode *ino, char *name) ino->i_ino = st.ino; ino->i_mode = st.mode; - ino->i_nlink = st.nlink; + set_nlink(ino, st.nlink); ino->i_uid = st.uid; ino->i_gid = st.gid; ino->i_atime = st.atime; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 96a8ed91cedd..2fa0089a02a8 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); } unlock_new_inode(result); } diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index dfe800c8ae38..3b2cec29972b 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; i->i_data.a_ops = &hpfs_symlink_aops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = ea_size; i->i_blocks = 1; brelse(bh); @@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i) } if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { brelse(bh); - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = 0; i->i_blocks = 1; init_special_inode(i, mode, @@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i) hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); i->i_blocks = 4 * n_dnodes; i->i_size = 2048 * n_dnodes; - i->i_nlink = 2 + n_subdirs; + set_nlink(i, 2 + n_subdirs); } else { i->i_mode |= S_IFREG; if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = le32_to_cpu(fnode->file_size); i->i_blocks = ((i->i_size + 511) >> 9) + 1; i->i_data.a_ops = &hpfs_aops; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 2df69e2f07cf..ea91fcb0ef9b 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) result->i_fop = &hpfs_dir_ops; result->i_blocks = 4; result->i_size = 2048; - result->i_nlink = 2; + set_nlink(result, 2); if (dee.read_only) result->i_mode &= ~0222; @@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; @@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = 0; result->i_blocks = 1; init_special_inode(result, mode, rdev); @@ 
-318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); result->i_blocks = 1; - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = strlen(symlink); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 970ea987b3f6..f590b1160c6c 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -702,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_ctime = proc_ino->i_ctime; inode->i_ino = proc_ino->i_ino; inode->i_mode = proc_ino->i_mode; - inode->i_nlink = proc_ino->i_nlink; + set_nlink(inode, proc_ino->i_nlink); inode->i_size = proc_ino->i_size; inode->i_blocks = proc_ino->i_blocks; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..562adabef985 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1319,7 +1319,7 @@ static int isofs_read_inode(struct inode *inode) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_nlink = 1; /* + set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is @@ -1337,7 +1337,7 @@ static int isofs_read_inode(struct inode *inode) */ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } - inode->i_nlink = 1; + set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 1fbc7de88f50..70e79d0c756a 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -363,7 +363,7 @@ repeat: break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); - inode->i_nlink = isonum_733(rr->u.PX.n_links); + set_nlink(inode, isonum_733(rr->u.PX.n_links)); inode->i_uid = isonum_733(rr->u.PX.uid); inode->i_gid = isonum_733(rr->u.PX.gid); break; @@ -496,7 +496,7 @@ repeat: goto out; } inode->i_mode = reloc->i_mode; - inode->i_nlink = reloc->i_nlink; + set_nlink(inode, reloc->i_nlink); inode->i_uid = reloc->i_uid; inode->i_gid = reloc->i_gid; inode->i_rdev = reloc->i_rdev; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 9659b7c00468..be6169bd8acd 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; + set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); @@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) f = JFFS2_INODE_INFO(inode); /* Directories get nlink 2 at start */ - inode->i_nlink = 2; + set_nlink(inode, 2); /* but ic->pino_nlink is the parent ino# */ f->inocache->pino_nlink = dir_i->i_ino; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bbcb9755dd2b..7286e44ac665 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) 
inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); - inode->i_nlink = f->inocache->pino_nlink; + set_nlink(inode, f->inocache->pino_nlink); inode->i_blocks = (inode->i_size + 511) >> 9; @@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFDIR: { struct jffs2_full_dirent *fd; - inode->i_nlink = 2; /* parent and '.' */ + set_nlink(inode, 2); /* parent and '.' */ for (fd=f->dents; fd; fd = fd->next) { if (fd->type == DT_DIR && fd->ino) @@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r iput(inode); return ERR_PTR(ret); } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_ino = je32_to_cpu(ri->ino); inode->i_mode = jemode_to_cpu(ri->mode); inode->i_gid = je16_to_cpu(ri->gid); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index b78b2f978f04..1b6f15f191b3 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); return (NULL); } @@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* copy on-disk inode to in-memory inode */ if ((copy_from_dinode(dp, ip)) != 0) { /* handle bad return by returning NULL for ip */ - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); /* release the page */ release_metapage(mp); @@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_mode |= 0001; } } - ip->i_nlink = le32_to_cpu(dip->di_nlink); + set_nlink(ip, le32_to_cpu(dip->di_nlink)); jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); if (sbi->uid == -1) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 94b0a624b85f..a112ad96e474 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) goto out3; } - ip->i_nlink = 2; /* for '.' */ + set_nlink(ip, 2); /* for '.' 
*/ ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; diff --git a/fs/libfs.c b/fs/libfs.c index a2c0029cd95f..f6d411eef1e7 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); root = d_alloc_root(inode); if (!root) { iput(inode); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index d8d09380c7de..2ac4217b7901 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) inode->i_atime = be64_to_timespec(di->di_atime); inode->i_ctime = be64_to_timespec(di->di_ctime); inode->i_mtime = be64_to_timespec(di->di_mtime); - inode->i_nlink = be32_to_cpu(di->di_refcount); + set_nlink(inode, be32_to_cpu(di->di_refcount)); inode->i_generation = be32_to_cpu(di->di_generation); switch (inode->i_mode & S_IFMT) { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e7d23e25bf1d..64cdcd662ffc 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -446,7 +446,7 @@ static struct inode *V1_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; inode->i_mtime.tv_nsec = 0; @@ -479,7 +479,7 @@ static struct inode *V2_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; inode->i_atime.tv_sec = raw_inode->i_atime; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 202f370526a7..5b5fa33b6b9d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -228,7 +228,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_uid = server->m.uid; inode->i_gid = server->m.gid; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1fc715acc2da..c07a55aec838 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -355,7 +355,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_DATA | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) @@ -1361,7 +1361,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 08ac272b7c28..b50ffb72e5b3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode, 
inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index a3141990061e..768982de10e4 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) nilfs_warning(inode->i_sb, __func__, "deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); if (err) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1371487da955..97e2dacbc867 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -612,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * might be tricky due to vfs interactions. Need to think about this * some more when implementing the unlink command. */ - vi->i_nlink = le16_to_cpu(m->link_count); + set_nlink(vi, le16_to_cpu(m->link_count)); /* * FIXME: Reparse points can have the directory bit set even though * they would be S_IFLNK. Need to deal with this further below when we @@ -634,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi) vi->i_mode &= ~vol->dmask; /* Things break without this kludge! */ if (vi->i_nlink > 1) - vi->i_nlink = 1; + set_nlink(vi, 1); } else { vi->i_mode |= S_IFREG; /* Apply the file permissions mask set in the mount options. */ @@ -1242,7 +1242,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; @@ -1508,7 +1508,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8582e3f4f120..e2878b5895fb 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2292,7 +2292,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, di_bh); i_size_write(inode, size); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); @@ -2354,7 +2354,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7642d7ca73e5..e1ed5e502ff2 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2092,7 +2092,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); inode->i_mode = be16_to_cpu(lvb->lvb_imode); - inode->i_nlink 
= be16_to_cpu(lvb->lvb_inlink); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); ocfs2_unpack_timespec(&inode->i_atime, be64_to_cpu(lvb->lvb_iatime_packed)); ocfs2_unpack_timespec(&inode->i_mtime, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b4c8bb6b8d28..a22d2c098890 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_flags)); @@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode, OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ea0ecbdda58c..a8b2bfea574e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -199,7 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) * these are used by the support functions here and in * callers. */ if (S_ISDIR(mode)) - inode->i_nlink = 2; + set_nlink(inode, 2); inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; @@ -2016,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, 1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, @@ -2114,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, -1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); leave: @@ -2435,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a2a5bff774e3..e4e0ff7962e2 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -242,7 +242,7 @@ found: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); break; case op_inode_prop: if (!strcmp(dp->name, "options") && (len == 17) && @@ -251,7 +251,7 @@ found: else inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = ent_oi->u.prop->length; break; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 8f0087e20e16..851ba3dcdc29 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2248,7 +2248,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = 
p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; /* Use getattr to fix if necessary */ + set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) @@ -2642,7 +2642,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; + set_nlink(inode, 2); if (S_ISLNK(inode->i_mode)) inode->i_size = 64; if (p->iop) @@ -2981,8 +2981,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); @@ -3233,8 +3233,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9d99131d0d65..10090d9c7ad5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct proc_dir_entry *de = PROC_I(inode)->pde; if (de && de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); generic_fillattr(inode, stat); return 0; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7ed72d6c1c6f..7737c5468a40 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -445,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (de->size) inode->i_size = de->size; if (de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); if (de->proc_iops) inode->i_op = de->proc_iops; if (de->proc_fops) { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2b0646613f5a..3bdd21418432 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -379,7 +379,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->di_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); - inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); + set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); inode->i_mtime.tv_nsec = 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c425441c6942..950f13af0951 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path) set_inode_item_key_version(inode, KEY_FORMAT_3_5); set_inode_sd_version(inode, STAT_DATA_V1); inode->i_mode = sd_v1_mode(sd); - inode->i_nlink = sd_v1_nlink(sd); + set_nlink(inode, sd_v1_nlink(sd)); inode->i_uid = sd_v1_uid(sd); inode->i_gid = sd_v1_gid(sd); inode->i_size = sd_v1_size(sd); @@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path) struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); inode->i_mode = sd_v2_mode(sd); - inode->i_nlink = sd_v2_nlink(sd); + set_nlink(inode, sd_v2_nlink(sd)); inode->i_uid = sd_v2_uid(sd); inode->i_size = sd_v2_size(sd); inode->i_gid = 
sd_v2_gid(sd); @@ -1832,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, #endif /* fill stat data */ - inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); /* uid and gid must already be set by the caller for quota init */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 6ce332821633..80058e8ce361 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -19,7 +19,7 @@ #include #include -#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); // directory item contains array of entry headers. This performs @@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_warning(inode->i_sb, "reiserfs-7042", "deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } drop_nlink(inode); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2305e3121cb1..8b4089f30408 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; inode->i_dataoffset = pos + inode->i_metasize; - i->i_nlink = 1; /* Hard to decide.. */ + set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 04bebcaa2373..fd7b3b3bda13 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) frag_offset = 0; } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; @@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; @@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; inode->i_data.a_ops = &squashfs_symlink_aops; @@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode 
*inode, long long ino) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } @@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } diff --git a/fs/stack.c b/fs/stack.c index b4f2ab48a61f..9c11519245a6 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -71,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - dest->i_nlink = src->i_nlink; + set_nlink(dest, src->i_nlink); } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e23f28894a3a..c81b22f3ace1 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -218,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sd->s_dir.subdirs + 2; + set_nlink(inode, sd->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 0630eb969a28..25ffb3e9a3f8 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); - inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); + set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b28121278d46..20403dc5d437 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) goto out_ino; inode->i_flags |= (S_NOCMTIME | S_NOATIME); - inode->i_nlink = le32_to_cpu(ino->nlink); + set_nlink(inode, le32_to_cpu(ino->nlink)); inode->i_uid = le32_to_cpu(ino->uid); inode->i_gid = le32_to_cpu(ino->gid); inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 993adc8f1c87..bf18f7a04544 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -561,7 +561,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) clear_nlink(inode); 
err = remove_xattr(c, host, inode, &nm); if (err) - inode->i_nlink = 1; + set_nlink(inode, 1); /* If @i_nlink is 0, 'iput()' will delete the inode */ iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358ed80c1..6e73f1d6e93c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1236,6 +1236,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) int offset; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct udf_inode_info *iinfo = UDF_I(inode); + unsigned int link_count; fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; @@ -1318,9 +1319,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_mode &= ~sbi->s_umask; read_unlock(&sbi->s_cred_lock); - inode->i_nlink = le16_to_cpu(fe->fileLinkCount); - if (!inode->i_nlink) - inode->i_nlink = 1; + link_count = le16_to_cpu(fe->fileLinkCount); + if (!link_count) + link_count = 1; + set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e8d61b114c1b..f1c64c6a4532 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -667,7 +667,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) iput(inode); goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = @@ -837,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { udf_debug("Deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = udf_delete_entry(dir, fi, &fibh, &cfi); if (retval) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index b4d791a83207..879b13436fa4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) * Copy data to the in-core inode. */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; @@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) * Copy data to the in-core inode. 
*/ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 9ba2a07b7343..23ce927973a4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1153,7 +1153,7 @@ xfs_setup_inode( hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; + set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a049fd2aa4c..48c1f5fc7411 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1754,6 +1754,19 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +/** + * set_nlink - directly set an inode's link count + * @inode: inode + * @nlink: new nlink (should be non-zero) + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. + */ +static inline void set_nlink(struct inode *inode, unsigned int nlink) +{ + inode->i_nlink = nlink; +} + /** * inc_nlink - directly increment an inode's link count * @inode: inode -- cgit v1.2.3-58-ga151 From a78ef704a8dd430225955f0709b22d4a6ba21deb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:30 +0200 Subject: vfs: protect i_nlink Prevent direct modification of i_nlink by making it const and adding a non-const __i_nlink alias. Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- fs/inode.c | 2 +- include/linux/fs.h | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index ecbb68dc7e2a..ee4e66b998f4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -142,7 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &empty_fops; - inode->i_nlink = 1; + inode->__i_nlink = 1; inode->i_opflags = 0; inode->i_uid = 0; inode->i_gid = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 48c1f5fc7411..23467d768cab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -768,7 +768,17 @@ struct inode { /* Stat data, not accessed from path walking */ unsigned long i_ino; - unsigned int i_nlink; + /* + * Filesystems may only read i_nlink directly. 
They shall use the + * following functions for modification: + * + * (set|clear|inc|drop)_nlink + * inode_(inc|dec)_link_count + */ + union { + const unsigned int i_nlink; + unsigned int __i_nlink; + }; dev_t i_rdev; loff_t i_size; struct timespec i_atime; @@ -1764,7 +1774,7 @@ static inline void mark_inode_dirty_sync(struct inode *inode) */ static inline void set_nlink(struct inode *inode, unsigned int nlink) { - inode->i_nlink = nlink; + inode->__i_nlink = nlink; } /** @@ -1777,7 +1787,7 @@ static inline void set_nlink(struct inode *inode, unsigned int nlink) */ static inline void inc_nlink(struct inode *inode) { - inode->i_nlink++; + inode->__i_nlink++; } static inline void inode_inc_link_count(struct inode *inode) @@ -1799,7 +1809,7 @@ static inline void inode_inc_link_count(struct inode *inode) */ static inline void drop_nlink(struct inode *inode) { - inode->i_nlink--; + inode->__i_nlink--; } /** @@ -1812,7 +1822,7 @@ static inline void drop_nlink(struct inode *inode) */ static inline void clear_nlink(struct inode *inode) { - inode->i_nlink = 0; + inode->__i_nlink = 0; } static inline void inode_dec_link_count(struct inode *inode) -- cgit v1.2.3-58-ga151 From f0023bc617ba600956b9226f1806033d7486c8ba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 28 Oct 2011 10:02:42 -0700 Subject: vfs: add d_prune dentry operation This adds a d_prune dentry operation that is called by the VFS prior to pruning (i.e. unhashing and killing) a hashed dentry from the dcache. Wrap dentry_lru_del() and use the new _prune() helper in the cases where we are about to unhash and kill the dentry. This will be used by Ceph to maintain a flag indicating whether the complete contents of a directory are contained in the dcache, allowing it to satisfy lookups and readdir without additional server communication. Renumber a few DCACHE_* #defines to group DCACHE_OP_PRUNE with the other DCACHE_OP_ bits. Signed-off-by: Sage Weil Signed-off-by: Christoph Hellwig --- Documentation/filesystems/Locking | 1 + fs/dcache.c | 40 ++++++++++++++++++++++++++++++++++----- include/linux/dcache.h | 8 +++++--- 3 files changed, 41 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 653380793a6c..d819ba16a0c7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -29,6 +29,7 @@ d_hash no no no maybe d_compare: yes no no maybe d_delete: no yes no no d_release: no no yes no +d_prune: no yes no no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no diff --git a/fs/dcache.c b/fs/dcache.c index a88948b8bd17..274f13e2f094 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -225,7 +225,7 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|move_tail) must be called with d_lock held. + * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { @@ -245,6 +245,9 @@ static void __dentry_lru_del(struct dentry *dentry) dentry_stat.nr_unused--; } +/* + * Remove a dentry with references from the LRU. + */ static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { @@ -254,6 +257,23 @@ static void dentry_lru_del(struct dentry *dentry) } } +/* + * Remove a dentry that is unreferenced and about to be pruned + * (unhashed and destroyed) from the LRU, and inform the file system. + * This wrapper should be called _prior_ to unhashing a victim dentry.
+ */ +static void dentry_lru_prune(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + static void dentry_lru_move_tail(struct dentry *dentry) { spin_lock(&dcache_lru_lock); @@ -403,8 +423,12 @@ relock: if (ref) dentry->d_count--; - /* if dentry was on the d_lru list delete it from there */ - dentry_lru_del(dentry); + /* + * if dentry was on the d_lru list delete it from there. + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -854,8 +878,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) do { struct inode *inode; - /* detach from the system */ - dentry_lru_del(dentry); + /* + * remove the dentry from the lru, and inform + * the fs that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); __d_shrink(dentry); if (dentry->d_count != 0) { @@ -1283,6 +1311,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; } EXPORT_SYMBOL(d_set_d_op); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 62157c03caf7..4df926199369 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -165,6 +165,7 @@ struct dentry_operations { unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); + void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); @@ -184,8 +185,9 @@ struct dentry_operations { #define DCACHE_OP_COMPARE 0x0002 #define DCACHE_OP_REVALIDATE 0x0004 #define DCACHE_OP_DELETE 0x0008 +#define DCACHE_OP_PRUNE 0x0010 -#define DCACHE_DISCONNECTED 0x0010 +#define DCACHE_DISCONNECTED 0x0020 /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first @@ -196,8 +198,8 @@ struct dentry_operations { * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ -#define DCACHE_REFERENCED 0x0020 /* Recently used, don't discard. */ -#define DCACHE_RCUACCESS 0x0040 /* Entry has ever been RCU-visible */ +#define DCACHE_REFERENCED 0x0040 /* Recently used, don't discard. 
*/ +#define DCACHE_RCUACCESS 0x0080 /* Entry has ever been RCU-visible */ #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 -- cgit v1.2.3-58-ga151 From 548fd1e8dba90bea674f5969d73498959d83924b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:08:59 +0100 Subject: um: kill useless include of user.h everything in USER_OBJ gets it via -include user.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_user.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index d51a98384bc0..dd7bc38a3825 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -16,7 +16,6 @@ #include #include "hostfs.h" #include "os.h" -#include "user.h" #include static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) -- cgit v1.2.3-58-ga151 From 7657cacf478940b995c2c73fdff981c13cc62c5c Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sat, 22 Oct 2011 01:34:48 +0100 Subject: Squashfs: Add an option to set dev block size to 4K This commit adds an option to set the device block size to 4K. By default Squashfs sets the device block size (sb_min_blocksize) to 1K or the smallest block size supported by the block device (if larger). This, because blocks are packed together and unaligned in Squashfs, should reduce latency. This, however, gives poor performance on MTD NAND devices where the optimal I/O size is 4K (even though the devices can support smaller block sizes). Using a 4K device block size may also improve overall I/O performance for some file access patterns (e.g. sequential accesses of files in filesystem order) on all media. Signed-off-by: Phillip Lougher --- fs/squashfs/Kconfig | 22 ++++++++++++++++++++++ fs/squashfs/squashfs_fs.h | 7 +++++++ fs/squashfs/super.c | 2 +- 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 1360d4f88f41..5586011b6a38 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -78,6 +78,28 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_4K_DEVBLK_SIZE + bool "Use 4K device block size?" + depends on SQUASHFS + help + By default Squashfs sets the dev block size (sb_min_blocksize) + to 1K or the smallest block size supported by the block device + (if larger). This, because blocks are packed together and + unaligned in Squashfs, should reduce latency. + + This, however, gives poor performance on MTD NAND devices where + the optimal I/O size is 4K (even though the devices can support + smaller block sizes). + + Using a 4K device block size may also improve overall I/O + performance for some file access patterns (e.g. sequential + accesses of files in filesystem order) on all media. + + Setting this option will force Squashfs to use a 4K device block + size by default. + + If unsure, say N.
+ config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b4a4e539a08c..e8e14645de9a 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -36,6 +36,13 @@ #define SQUASHFS_FILE_SIZE 131072 #define SQUASHFS_FILE_LOG 17 +/* default size of block device I/O */ +#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE +#define SQUASHFS_DEVBLK_SIZE 4096 +#else +#define SQUASHFS_DEVBLK_SIZE 1024 +#endif + #define SQUASHFS_FILE_MAX_SIZE 1048576 #define SQUASHFS_FILE_MAX_LOG 20 diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 7438850c62d0..2da1715452ac 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->read_data_mutex); -- cgit v1.2.3-58-ga151 From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 2 Nov 2011 13:37:41 -0700 Subject: binfmt_elf: fix PIE execution with randomization disabled The case of address space randomization being disabled at runtime through the randomize_va_space sysctl is not treated properly in load_elf_binary(), resulting in SIGKILL coming at exec() time for certain PIE-linked binaries in case the randomization has been disabled at runtime prior to calling exec(). Handle the randomize_va_space == 0 case the same way as if we were not supporting .text randomization at all. Based on original patch by H.J. Lu and Josh Boyer. Signed-off-by: Jiri Kosina Cc: Ingo Molnar Cc: Russell King Cc: H.J. Lu Cc: Tested-by: Josh Boyer Acked-by: Nicolas Pitre Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc56d38..21ac5ee4b43f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif -- cgit v1.2.3-58-ga151 From 0620d9193cb976ba635d56a6cfd11cb81616d02b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 2 Nov 2011 13:37:45 -0700 Subject: ramfs: remove module leftovers Since ramfs is hard-selected to "y", the module leftovers make no sense. Signed-off-by: Richard Weinberger Reviewed-by: WANG Cong Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eacb166fb259..462ceb38fec6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -23,7 +23,6 @@ * caches is sufficient.
*/ -#include #include #include @@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } - -static void __exit exit_ramfs_fs(void) -{ - unregister_filesystem(&ramfs_fs_type); -} - module_init(init_ramfs_fs) -module_exit(exit_ramfs_fs) int __init init_rootfs(void) { @@ -311,5 +303,3 @@ int __init init_rootfs(void) return err; } - -MODULE_LICENSE("GPL") -- cgit v1.2.3-58-ga151 From 3069083cc8def2ffad8520f0f24c6f95f140aac5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 2 Nov 2011 13:38:00 -0700 Subject: isofs: add readpages support Use mpage_readpages() instead of multiple calls to isofs_readpage() to reduce CPU utilization and improve performance. Signed-off-by: Namjae Jeon Cc: Al Viro Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..46844ff39d61 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "isofs.h" #include "zisofs.h" @@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block) static int isofs_readpage(struct file *file, struct page *page) { - return block_read_full_page(page,isofs_get_block); + return mpage_readpage(page, isofs_get_block); } + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, + .readpages = isofs_readpages, .bmap = _isofs_bmap }; -- cgit v1.2.3-58-ga151 From 434a964daa14b9db083ce20404a4a2add54d037a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 2 Nov 2011 13:38:01 -0700 Subject: hfs: fix hfs_find_init() sb->ext_tree NULL ptr oops Clement Lecigne reports a filesystem which causes a kernel oops in hfs_find_init() trying to dereference sb->ext_tree, which is NULL. This proves to be because the filesystem has a corrupted MDB extent record, where the extents file does not fit into the first three extents in the file record (the first blocks). In hfs_get_block(), when looking up the blocks for the extent file (HFS_EXT_CNID), it fails the first-blocks special case and falls through to the extent code (which ultimately calls hfs_find_init()) while that code is still in the process of being initialised. HFS avoids this scenario by ensuring the extents B-tree always fits into the first blocks (the extents B-tree can't have overflow extents). The fix is to check at mount time that the B-tree fits into the first blocks, i.e. fail if HFS_I(inode)->alloc_blocks >= HFS_I(inode)->first_blocks Note, the existing commit 47f365eb57573 ("hfs: fix oops on mount with corrupted btree extent records") becomes subsumed into this as a special case, but only for the extents B-tree (HFS_EXT_CNID); it is perfectly acceptable for the catalog B-tree file to grow beyond three extents, with the remaining extent descriptors in the extents overflow.
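To make the mount-time invariant concrete, here is a small worked illustration (the numbers are invented for illustration and are not from the report):

unsigned int first_blocks = 3;	/* blocks reachable from the three extents stored directly in the MDB record */
unsigned int alloc_blocks = 7;	/* blocks the corrupted record claims the extents file occupies */

/* alloc_blocks > first_blocks: blocks 3..6 could only be located by
 * consulting the extents B-tree, but opening that very tree is what is
 * in progress -- sb->ext_tree is still NULL, so the lookup would oops.
 * The new check rejects such a filesystem in hfs_btree_open() before
 * any such lookup can happen. */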
This fixes CVE-2011-2203 Reported-by: Clement LECIGNE Signed-off-by: Phillip Lougher Cc: Jeff Mahoney Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 3ebc437736fe..1cbdeea1db44 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: @@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); - if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); - goto free_inode; - } - mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) -- cgit v1.2.3-58-ga151 From 887df07891de0435c25cffb92268fea2c621f99c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 2 Nov 2011 13:38:42 -0700 Subject: procfs: report EISDIR when reading sysctl dirs in proc On reading sysctl dirs we should return -EISDIR instead of -EINVAL. Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbef226f..dacd840a675a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -370,6 +370,7 @@ static const struct file_operations proc_sys_file_operations = { }; static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, .readdir = proc_sys_readdir, .llseek = generic_file_llseek, }; -- cgit v1.2.3-58-ga151 From aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Wed, 2 Nov 2011 13:38:44 -0700 Subject: proc: fix races against execve() of /proc/PID/fd** fd* files are restricted to the task's owner, and other users may not get direct access to them. But one may open any of these files and then run a setuid program, keeping the file descriptors open. As there are permission checks on open(), but not on readdir() and read(), operations on the kept file descriptors will not be checked. This makes it possible to violate the procfs permission model. Reading fdinfo/* may disclose the current fds' positions and flags, and reading the directory contents of fdinfo/ and fd/ may disclose the number of files opened by the target task. This information is not sensitive per se, but it can reveal some private information (like the length of a password stored in a file) under certain conditions.
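A minimal userspace sketch of the kept-descriptor scenario (purely hypothetical illustration, not part of the patch; the fdinfo path and the setuid target are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char path[64], buf[256];
	pid_t pid = fork();

	if (pid == 0) {
		/* child: turns itself into a privileged task */
		execl("/usr/bin/passwd", "passwd", (char *)NULL);
		_exit(1);
	}

	/* parent: race the execve() -- open the child's fdinfo while the
	 * child is still unprivileged, so the open() permission check passes */
	snprintf(path, sizeof(path), "/proc/%d/fdinfo/0", (int)pid);
	int fd = open(path, O_RDONLY);

	/* once the child runs setuid, a fresh open() would be refused, but
	 * read() on the kept descriptor used to keep working, leaking the
	 * privileged task's file position and flags */
	ssize_t n = (fd >= 0) ? read(fd, buf, sizeof(buf) - 1) : -1;
	if (n > 0)
		fwrite(buf, 1, n, stdout);
	return 0;
}

With this patch the read() (like stat(), readdir() and lookup() on fd/ and fdinfo/) fails with EACCES once ptrace_may_access() no longer lets the caller trace the target.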
Used existing (un)lock_trace functions to check for ptrace_may_access(), but instead of using EPERM return code from it use EACCES to be consistent with existing proc_pid_follow_link()/proc_pid_readlink() return code. If they differ, attacker can guess what fds exist by analyzing stat() return code. Patched handlers: stat() for fd/*, stat() and read() for fdindo/*, readdir() and lookup() for fd/ and fdinfo/. Signed-off-by: Vasiliy Kulikov Cc: Cyrill Gorcunov Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 146 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8f0087e20e16..d4f4913f00db 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,12 +1652,46 @@ out: return error; } +static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + int rc; + + if (task == NULL) + return -ESRCH; + + rc = -EACCES; + if (lock_trace(task)) + goto out_task; + + generic_fillattr(inode, stat); + unlock_trace(task); + rc = 0; +out_task: + put_task_struct(task); + return rc; +} + static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; +static const struct inode_operations proc_fdinfo_link_inode_operations = { + .setattr = proc_setattr, + .getattr = proc_pid_fd_link_getattr, +}; + +static const struct inode_operations proc_fd_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, + .getattr = proc_pid_fd_link_getattr, +}; + /* building an inode */ @@ -1889,49 +1923,61 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; + struct task_struct *task; + struct files_struct *files; struct file *file; int fd = proc_fd(inode); + int rc; - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; + task = get_proc_task(inode); + if (!task) + return -ENOENT; + + rc = -EACCES; + if (lock_trace(task)) + goto out_task; + + rc = -ENOENT; + files = get_files_struct(task); + if (files == NULL) + goto out_unlock; + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + rc = 0; + } else + rc = -ENOENT; + spin_unlock(&files->file_lock); + put_files_struct(files); + +out_unlock: + unlock_trace(task); +out_task: + put_task_struct(task); + return rc; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_fd_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out; + result = instantiate(dir, dentry, task, &fd); + unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; + + retval = -EACCES; + if (lock_trace(p)) + goto out; + retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; + goto out_unlock; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; + goto out_unlock; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out; + goto out_unlock; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } + +out_unlock: + unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; + inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ -- cgit v1.2.3-58-ga151 From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Nov 2011 13:39:22 -0700 Subject: sysctl: add support for poll() Adding support for poll() in sysctl fs allows userspace to receive notifications of changes in sysctl entries. This adds an infrastructure to allow files in sysctl fs to be pollable and implements it for hostname and domainname. [akpm@linux-foundation.org: s/declare/define/ for definitions] Signed-off-by: Lucas De Marchi Cc: Greg KH Cc: Kay Sievers Cc: Al Viro Cc: "Eric W.
Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 22 ++++++++++++++++++++++ include/linux/utsname.h | 16 ++++++++++++++++ kernel/sys.c | 2 ++ kernel/utsname_sysctl.c | 23 +++++++++++++++++++++++ 5 files changed, 108 insertions(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dacd840a675a..df594803f45a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include #include @@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned long event = (unsigned long)filp->private_data; + unsigned int ret = DEFAULT_POLLMASK; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + return ret; +} static int proc_sys_fill_cache(struct file *filp, void *dirent, filldir_t filldir, @@ -364,6 +407,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct } static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, .read = proc_sys_read, .write = proc_sys_write, .llseek = default_llseek, diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 9a1ec10fd504..703cfa33a3ca 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -931,6 +931,7 @@ enum #ifdef __KERNEL__ #include #include +#include /* For the /proc/sys support */ struct ctl_table; @@ -1011,6 +1012,26 @@ extern int proc_do_large_bitmap(struct ctl_table *, int, * cover common cases. 
*/ +/* Support for userspace poll() to watch for changes */ +struct ctl_table_poll { + atomic_t event; + wait_queue_head_t wait; +}; + +static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) +{ + return (void *)(unsigned long)atomic_read(&poll->event); +} + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +#define __CTL_TABLE_POLL_INITIALIZER(name) { \ + .event = ATOMIC_INIT(0), \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } + +#define DEFINE_CTL_TABLE_POLL(name) \ + struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) + /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { @@ -1021,6 +1042,7 @@ struct ctl_table struct ctl_table *child; struct ctl_table *parent; /* Automatically set */ proc_handler *proc_handler; /* Callback for text formatting */ + struct ctl_table_poll *poll; void *extra1; void *extra2; }; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 4e5b0213fdc1..c714ed75eae2 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -37,6 +37,14 @@ struct new_utsname { #include #include +enum uts_proc { + UTS_PROC_OSTYPE, + UTS_PROC_OSRELEASE, + UTS_PROC_VERSION, + UTS_PROC_HOSTNAME, + UTS_PROC_DOMAINNAME, +}; + struct user_namespace; extern struct user_namespace init_user_ns; @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, } #endif +#ifdef CONFIG_PROC_SYSCTL +extern void uts_proc_notify(enum uts_proc proc); +#else +static inline void uts_proc_notify(enum uts_proc proc) +{ +} +#endif + static inline struct new_utsname *utsname(void) { return ¤t->nsproxy->uts_ns->name; diff --git a/kernel/sys.c b/kernel/sys.c index 58459509b14c..d06c091e0345 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } + uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } + uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..3b0d48ebf81d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -13,6 +13,7 @@ #include #include #include +#include static void *get_uts(ctl_table *table, int write) { @@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, uts_table.data = get_uts(table, write); r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); + + if (write) + proc_sys_poll_notify(table->poll); + return r; } #else #define proc_do_uts_string NULL #endif +static DEFINE_CTL_TABLE_POLL(hostname_poll); +static DEFINE_CTL_TABLE_POLL(domainname_poll); + static struct ctl_table uts_kern_table[] = { { .procname = "ostype", @@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &hostname_poll, }, { .procname = "domainname", @@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &domainname_poll, }, {} }; @@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { {} }; +#ifdef CONFIG_PROC_SYSCTL +/* + * Notify userspace about a change in a certain entry of 
uts_kern_table, + * identified by the parameter proc. + */ +void uts_proc_notify(enum uts_proc proc) +{ + struct ctl_table *table = &uts_kern_table[proc]; + + proc_sys_poll_notify(table->poll); +} +#endif + static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); -- cgit v1.2.3-58-ga151 From 080d676de095a14ecba14c0b9a91acb5bbb634df Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 2 Nov 2011 13:40:10 -0700 Subject: aio: allocate kiocbs in batches In testing aio on a fast storage device, I found that the context lock takes up a fair amount of cpu time in the I/O submission path. The reason is that we take it for every I/O submitted (see __aio_get_req). Since we know how many I/Os are passed to io_submit, we can preallocate the kiocbs in batches, reducing the number of times we take and release the lock. In my testing, I was able to reduce the amount of time spent in _raw_spin_lock_irq by .56% (average of 3 runs). The command I used to test this was: aio-stress -O -o 2 -o 3 -r 8 -d 128 -b 32 -i 32 -s 16384 I also tested the patch with various numbers of events passed to io_submit, and I ran the xfstests aio group of tests to ensure I didn't break anything. Signed-off-by: Jeff Moyer Cc: Daniel Ehrenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 136 +++++++++++++++++++++++++++++++++++++++++----------- include/linux/aio.h | 1 + 2 files changed, 108 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 632b235f4fbe..78c514cfd212 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm) static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - struct aio_ring *ring; - int okay = 0; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) @@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; - /* Check if the completion queue has enough free space to - * accept an event from this io. - */ + return req; +} + +/* + * struct kiocb's are allocated in batches to reduce the number of + * times the ctx lock is acquired and released. + */ +#define KIOCB_BATCH_SIZE 32L +struct kiocb_batch { + struct list_head head; + long count; /* number of requests left to allocate */ +}; + +static void kiocb_batch_init(struct kiocb_batch *batch, long total) +{ + INIT_LIST_HEAD(&batch->head); + batch->count = total; +} + +static void kiocb_batch_free(struct kiocb_batch *batch) +{ + struct kiocb *req, *n; + + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + } +} + +/* + * Allocate a batch of kiocbs. This avoids taking and dropping the + * context lock a lot during setup. 
+ */ +static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) +{ + unsigned short allocated, to_alloc; + long avail; + bool called_fput = false; + struct kiocb *req, *n; + struct aio_ring *ring; + + to_alloc = min(batch->count, KIOCB_BATCH_SIZE); + for (allocated = 0; allocated < to_alloc; allocated++) { + req = __aio_get_req(ctx); + if (!req) + /* allocation failed, go with what we've got */ + break; + list_add(&req->ki_batch, &batch->head); + } + + if (allocated == 0) + goto out; + +retry: spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); - if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + ring = kmap_atomic(ctx->ring_info.ring_pages[0]); + + avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + BUG_ON(avail < 0); + if (avail == 0 && !called_fput) { + /* + * Handle a potential starvation case. It is possible that + * we hold the last reference on a struct file, causing us + * to delay the final fput to non-irq context. In this case, + * ctx->reqs_active is artificially high. Calling the fput + * routine here may free up a slot in the event completion + * ring, allowing this allocation to succeed. + */ + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); + aio_fput_routine(NULL); + called_fput = true; + goto retry; + } + + if (avail < allocated) { + /* Trim back the number of requests. */ + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + if (--allocated <= avail) + break; + } + } + + batch->count -= allocated; + list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); ctx->reqs_active++; - okay = 1; } - kunmap_atomic(ring, KM_USER0); - spin_unlock_irq(&ctx->ctx_lock); - if (!okay) { - kmem_cache_free(kiocb_cachep, req); - req = NULL; - } + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); - return req; +out: + return allocated; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx, + struct kiocb_batch *batch) { struct kiocb *req; - /* Handle a potential starvation case -- should be exceedingly rare as - * requests will be stuck on fput_head only if the aio_fput_routine is - * delayed and the requests were the last user of the struct file. 
- */ - req = __aio_get_req(ctx); - if (unlikely(NULL == req)) { - aio_fput_routine(NULL); - req = __aio_get_req(ctx); - } + + if (list_empty(&batch->head)) + if (kiocb_batch_refill(ctx, batch) == 0) + return NULL; + req = list_first_entry(&batch->head, struct kiocb, ki_batch); + list_del(&req->ki_batch); return req; } @@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + struct iocb *iocb, struct kiocb_batch *batch, + bool compat) { struct kiocb *req; struct file *file; @@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!file)) return -EBADF; - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx, batch); /* returns with 2 references to req */ if (unlikely(!req)) { fput(file); return -EAGAIN; @@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, { struct kioctx *ctx; long ret = 0; - int i; + int i = 0; struct blk_plug plug; + struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } + kiocb_batch_init(&batch, nr); + blk_start_plug(&plug); /* @@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); if (ret) break; } blk_finish_plug(&plug); + kiocb_batch_free(&batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index 2dcb72bff4b6..2314ad8b3c9c 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -117,6 +117,7 @@ struct kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ + struct list_head ki_batch; /* batch allocation */ /* * If the aio_resfd field of the userspace iocb is not zero, -- cgit v1.2.3-58-ga151 From 6f276e49fd108362be3fd67154aaaacf872ea026 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 1 Nov 2011 12:16:15 +0600 Subject: nfs: Fix unused variable warning from file.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following unused variable warning. 
fs/nfs/file.c: In function ‘nfs_file_release’: fs/nfs/file.c:140:17: warning: unused variable ‘dentry’ fs/nfs/file.c: In function ‘nfs_file_read’: fs/nfs/file.c:237:9: warning: unused variable ‘count’ Signed-off-by: Rakib Mullick Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f3cda3..bd7dff001106 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { - struct dentry *dentry = filp->f_path.dentry; - dprintk("NFS: release(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); @@ -234,14 +232,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; - size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { -- cgit v1.2.3-58-ga151 From 2b72c9ccd22c4a3299e5a358dcd639fb253730f4 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 1 Nov 2011 12:23:42 +0600 Subject: nfs: Remove unused variable from write.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_NFS=y and CONFIG_NFS_V3_{,V4}=n we get the following warning. fs/nfs/write.c: In function ‘nfs_writeback_done’: fs/nfs/write.c:1246:21: warning: unused variable ‘server’ Remove the variable 'server' to fix the above warning. Signed-off-by: Rakib Mullick Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b2..b016b8a36399 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1243,7 +1243,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1277,7 +1276,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - server->nfs_client->cl_hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } -- cgit v1.2.3-58-ga151 From 4cdc685c7d06f659ef6c336d4242005cdd8df401 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:45:06 -0700 Subject: pnfs-obj: Remove redundant EOF from objlayout_io_state The EOF calculation was done on .read_pagelist(), cached in objlayout_io_state->eof, and set in objlayout_read_done() into nfs_read_data->res.eof. So set it directly into nfs_read_data->res.eof and avoid the extra member. 
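The eof computation that replaces the cached member is easy to restate standalone; the following is a sketch with invented values, not the kernel function itself:

/* Mirrors the post-patch logic of objlayout_read_pagelist(): trim the
 * request to i_size and derive res.eof from the trimmed span. */
static int read_hits_eof(long long offset, unsigned long *count, long long eof)
{
	if (offset >= eof) {			/* nothing left to read */
		*count = 0;
		return 1;
	}
	if (offset + (long long)*count > eof)
		*count = eof - offset;		/* shorten the final read */
	return offset + (long long)*count >= eof;
}

For a 10000-byte file, read_hits_eof(8192, &count, 10000) with count = 4096 trims count to 1808 and returns 1, which is exactly the value now stored straight into rdata->res.eof.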
Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objlayout.c | 16 +++++++--------- fs/nfs/objlayout/objlayout.h | 1 - 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e2adea..1300736e0fb4 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -287,17 +287,14 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) { - int eof = state->eof; - struct nfs_read_data *rdata; + struct nfs_read_data *rdata = state->rpcdata; state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); - rdata = state->rpcdata; + dprintk("%s: Begin status=%zd eof=%d\n", __func__, + status, rdata->res.eof); rdata->task.tk_status = status; - if (status >= 0) { + if (status >= 0) rdata->res.count = status; - rdata->res.eof = eof; - } objlayout_iodone(state); /* must not use state after this point */ @@ -330,11 +327,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) status = 0; rdata->res.count = 0; rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } + rdata->res.eof = (offset + count) >= eof; + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, rdata->args.pages, rdata->args.pgbase, offset, count, @@ -345,8 +345,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) goto out; } - state->eof = state->offset + state->count >= eof; - status = objio_read_pagelist(state); out: dprintk("%s: Return status %Zd\n", __func__, status); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8e042d..ffb884c6fef0 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -86,7 +86,6 @@ struct objlayout_io_state { void *rpcdata; int status; /* res */ - int eof; /* res */ int committed; /* res */ /* Error reporting (layout_return) */ -- cgit v1.2.3-58-ga151 From e6c40fe3f4c4967f1cb486191ed4a5d5f55f3f7e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:45:46 -0700 Subject: pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist The objlayout driver was always returning PNFS_ATTEMPTED from its read/write_pagelist operations, even on error. Fix that. Start by establishing an error return API from the io-engine, by not returning ssize_t (length-or-error) but returning "int": 0 = OK, < 0 = error. And clean up all return types in the io-engine. Then, if the io-engine returns an error, return PNFS_NOT_ATTEMPTED to the generic layer.
(With a dprint) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 32 ++++++++++++++++---------------- fs/nfs/objlayout/objlayout.c | 36 +++++++++++++++++++----------------- fs/nfs/objlayout/objlayout.h | 4 ++-- 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d0cda12fddc3..0c7c9ec24e67 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -142,7 +142,7 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) } struct objio_state; -typedef ssize_t (*objio_done_fn)(struct objio_state *ios); +typedef int (*objio_done_fn)(struct objio_state *ios); struct objio_state { /* Generic layer */ @@ -720,7 +720,7 @@ out: return 0; } -static ssize_t _sync_done(struct objio_state *ios) +static int _sync_done(struct objio_state *ios) { struct completion *waiting = ios->private; @@ -742,10 +742,10 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static ssize_t _io_exec(struct objio_state *ios) +static int _io_exec(struct objio_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); - ssize_t status = 0; /* sync status */ + int ret = 0; unsigned i; objio_done_fn saved_done_fn = ios->done; bool sync = ios->ol_state.sync; @@ -771,16 +771,16 @@ static ssize_t _io_exec(struct objio_state *ios) if (sync) { wait_for_completion(&wait); - status = saved_done_fn(ios); + ret = saved_done_fn(ios); } - return status; + return ret; } /* * read */ -static ssize_t _read_done(struct objio_state *ios) +static int _read_done(struct objio_state *ios) { ssize_t status; int ret = _io_check(ios, false); @@ -793,7 +793,7 @@ static ssize_t _read_done(struct objio_state *ios) status = ret; objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + return ret; } static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) @@ -833,7 +833,7 @@ err: return ret; } -static ssize_t _read_exec(struct objio_state *ios) +static int _read_exec(struct objio_state *ios) { unsigned i; int ret; @@ -847,14 +847,14 @@ static ssize_t _read_exec(struct objio_state *ios) } ios->done = _read_done; - return _io_exec(ios); /* In sync mode exec returns the io status */ + return _io_exec(ios); err: _io_free(ios); return ret; } -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct objlayout_io_state *ol_state) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); @@ -870,7 +870,7 @@ ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) /* * write */ -static ssize_t _write_done(struct objio_state *ios) +static int _write_done(struct objio_state *ios) { ssize_t status; int ret = _io_check(ios, true); @@ -887,7 +887,7 @@ static ssize_t _write_done(struct objio_state *ios) } objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + return ret; } static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) @@ -955,7 +955,7 @@ err: return ret; } -static ssize_t _write_exec(struct objio_state *ios) +static int _write_exec(struct objio_state *ios) { unsigned i; int ret; @@ -969,14 +969,14 @@ static ssize_t _write_exec(struct objio_state *ios) } ios->done = _write_done; - return _io_exec(ios); /* In sync mode exec returns the io->status */ + return _io_exec(ios); err: _io_free(ios); return ret; } -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +int objio_write_pagelist(struct objlayout_io_state 
*ol_state, bool stable) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1300736e0fb4..99c807df11dc 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -315,16 +315,13 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) loff_t offset = rdata->args.offset; size_t count = rdata->args.count; struct objlayout_io_state *state; - ssize_t status = 0; + int err; loff_t eof; - dprintk("%s: Begin inode %p offset %llu count %d\n", - __func__, rdata->inode, offset, (int)count); - eof = i_size_read(rdata->inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { - status = 0; + err = 0; rdata->res.count = 0; rdata->res.eof = 1; /*FIXME: do we need to call pnfs_ld_read_done() */ @@ -341,14 +338,19 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) rdata->lseg, rdata, GFP_KERNEL); if (unlikely(!state)) { - status = -ENOMEM; + err = -ENOMEM; goto out; } + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - status = objio_read_pagelist(state); + err = objio_read_pagelist(state); out: - dprintk("%s: Return status %Zd\n", __func__, status); - rdata->pnfs_error = status; + if (unlikely(err)) { + rdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -406,10 +408,7 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { struct objlayout_io_state *state; - ssize_t status; - - dprintk("%s: Begin inode %p offset %llu count %u\n", - __func__, wdata->inode, wdata->args.offset, wdata->args.count); + int err; state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, wdata->args.pages, @@ -419,16 +418,19 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, wdata->lseg, wdata, GFP_NOFS); if (unlikely(!state)) { - status = -ENOMEM; + err = -ENOMEM; goto out; } state->sync = how & FLUSH_SYNC; - status = objio_write_pagelist(state, how & FLUSH_STABLE); + err = objio_write_pagelist(state, how & FLUSH_STABLE); out: - dprintk("%s: Return status %Zd\n", __func__, status); - wdata->pnfs_error = status; + if (unlikely(err)) { + wdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index ffb884c6fef0..4edac9b6ac0c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -115,8 +115,8 @@ extern int objio_alloc_io_state( gfp_t gfp_flags); extern void objio_free_io_state(struct objlayout_io_state *state); -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, +extern int objio_read_pagelist(struct objlayout_io_state *ol_state); +extern int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable); /* -- cgit v1.2.3-58-ga151 From 96218556b03d3c6505e2880a097338bf277fd783 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:47:32 -0700 Subject: pnfs-obj: Get rid of objlayout_{alloc,free}_io_state This is part of moving objio_osd to use the ORE. objlayout_io_state had two functions: 1. It was used in the error reporting mechanism at layout_return. This function is kept intact. (A later patch will rename objlayout_io_state => objlayout_io_res.) 2. Carrier of rw io members into the objio_{read,write}_pagelist API.
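To make the two roles concrete, here is a minimal sketch of the split this patch creates. The field names are taken from the hunks below; the stub declarations at the top stand in for the real kernel types, so this is a self-contained illustration rather than the exact kernel definitions:

#include <stdbool.h>

/* Stubs standing in for kernel types -- an assumption of this sketch only. */
struct page;
struct objlayout;
struct pnfs_osd_ioerr;
struct list_head { struct list_head *next, *prev; };
typedef long long loff_t;

/* Function 1 -- kept in the generic objlayout layer: the error-reporting
 * state consumed at layout_return (later renamed objlayout_io_res). */
struct objlayout_io_state {
	struct objlayout *objlay;	/* owning layout */
	void *rpcdata;			/* the NFS rdata/wdata */
	int status;			/* I/O result */
	int committed;			/* write persistence, e.g. NFS_FILE_SYNC */
	struct list_head err_list;	/* tracked on objlay->err_list on error */
	unsigned num_comps;		/* entries in ioerrs[] */
	struct pnfs_osd_ioerr *ioerrs;	/* per-component error records */
};

/* Function 2 -- the rw carrier, now private to the io_engine (objio_osd),
 * which allocates it itself as part of read/write. */
struct objio_state {
	struct objlayout_io_state ol_state; /* embedded generic part */
	struct page **pages;		/* data pages */
	unsigned pgbase;		/* byte offset into the first page */
	unsigned nr_pages;
	unsigned long count;		/* total byte count */
	loff_t offset;			/* file offset */
	bool sync;			/* synchronous I/O? */
	/* ... plus raid-engine bookkeeping (kref, per_dev[], ...) */
};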
The second function (the rw carrier) is removed in this patch. The {r,w}data received from NFS are passed directly to the objio_{read,write}_pagelist API. The io_engine now allocates its own IO state as part of the read/write. The minimal functionality that was part of the generic allocation is passed to the io_engine. So part of this patch is a rename: ios->ol_state.foo => ios->foo. At objlayout_{read,write}_done an objlayout_io_state is passed that denotes the result of the IO (hence the later name change). If the IO is successful, objlayout calls the objio_free_result() API immediately (which for objio_osd causes the release of the io_state). If the IO ended in an error, the state is hung onto until it is reported in layout_return, and is released later through the same objio_free_result() API. (All this is not new, just renamed and cleaned up.) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 94 +++++++++++++++++++++----------- fs/nfs/objlayout/objlayout.c | 124 +++++++++++-------------------------------- fs/nfs/objlayout/objlayout.h | 36 ++++++------- 3 files changed, 112 insertions(+), 142 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 0c7c9ec24e67..48eb91aad554 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -148,6 +148,13 @@ struct objio_state { /* Generic layer */ struct objlayout_io_state ol_state; + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + struct objio_segment *layout; struct kref kref; @@ -394,30 +401,43 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) kfree(objio_seg); } -int objio_alloc_io_state(struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags) +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); struct objio_state *ios; - const unsigned first_size = sizeof(*ios) + - objio_seg->num_comps * sizeof(ios->per_dev[0]); - const unsigned sec_size = objio_seg->num_comps * - sizeof(ios->ol_state.ioerrs[0]); - - ios = kzalloc(first_size + sec_size, gfp_flags); - if (unlikely(!ios)) + struct __alloc_objio_state { + struct objio_state objios; + struct _objio_per_comp per_dev[objio_seg->num_comps]; + struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) return -ENOMEM; - ios->layout = objio_seg; - ios->ol_state.ioerrs = ((void *)ios) + first_size; - ios->ol_state.num_comps = objio_seg->num_comps; + ios = &aos->objios; - *outp = &ios->ol_state; + ios->layout = objio_seg; + objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps, + aos->ioerrs, rpcdata, pnfs_layout_type); + + ios->pages = pages; + ios->pgbase = pgbase; + ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + ios->offset = offset; + ios->count = count; + ios->sync = 0; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + *outp = ios; return 0; } -void objio_free_io_state(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_state *ol_state) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); @@ -598,7 +618,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, if (per_dev->bio == NULL)
{ unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / ios->layout->group_width; if (BIO_MAX_PAGES_KMALLOC < bio_size) @@ -615,11 +635,11 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->ol_state.nr_pages <= pg); + BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; added_len = bio_add_pc_page(q, per_dev->bio, - ios->ol_state.pages[pg], pglen, pgbase); + ios->pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) return -ENOMEM; pgbase = 0; @@ -660,7 +680,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && - (page_off != ios->ol_state.pgbase)); + (page_off != ios->pgbase)); } else { /* dev > si->dev */ per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; @@ -693,8 +713,8 @@ out: static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) { - u64 length = ios->ol_state.count; - u64 offset = ios->ol_state.offset; + u64 length = ios->count; + u64 offset = ios->offset; struct _striping_info si; unsigned last_pg = 0; int ret = 0; @@ -748,7 +768,7 @@ static int _io_exec(struct objio_state *ios) int ret = 0; unsigned i; objio_done_fn saved_done_fn = ios->done; - bool sync = ios->ol_state.sync; + bool sync = ios->sync; if (sync) { ios->done = _sync_done; @@ -792,7 +812,7 @@ static int _read_done(struct objio_state *ios) else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + objlayout_read_done(&ios->ol_state, status, ios->sync); return ret; } @@ -854,12 +874,18 @@ err: return ret; } -int objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios; int ret; + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &ios); + if (unlikely(ret)) + return ret; + ret = _io_rw_pagelist(ios, GFP_KERNEL); if (unlikely(ret)) return ret; @@ -886,7 +912,7 @@ static int _write_done(struct objio_state *ios) status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + objlayout_write_done(&ios->ol_state, status, ios->sync); return ret; } @@ -976,12 +1002,20 @@ err: return ret; } -int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios; int ret; + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &ios); + if (unlikely(ret)) + return ret; + + ios->sync = 0 != (how & FLUSH_SYNC); + /* TODO: ios->stable = stable; */ ret = _io_rw_pagelist(ios, GFP_NOFS); if (unlikely(ret)) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 99c807df11dc..a82053ae5595 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -156,59 +156,23 @@ last_byte_offset(u64 start, u64 len) return end > start ? 
end - 1 : NFS4_MAX_UINT64; } -static struct objlayout_io_state * -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, - struct page **pages, - unsigned pgbase, - loff_t offset, - size_t count, - struct pnfs_layout_segment *lseg, - void *rpcdata, - gfp_t gfp_flags) +void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) { - struct objlayout_io_state *state; u64 lseg_end_offset; - dprintk("%s: allocating io_state\n", __func__); - if (objio_alloc_io_state(lseg, &state, gfp_flags)) - return NULL; - BUG_ON(offset < lseg->pls_range.offset); lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); BUG_ON(offset >= lseg_end_offset); - if (offset + count > lseg_end_offset) { - count = lseg->pls_range.length - - (offset - lseg->pls_range.offset); - dprintk("%s: truncated count %Zd\n", __func__, count); - } + WARN_ON(offset + count > lseg_end_offset); - if (pgbase > PAGE_SIZE) { - pages += pgbase >> PAGE_SHIFT; - pgbase &= ~PAGE_MASK; + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; } - - INIT_LIST_HEAD(&state->err_list); - state->lseg = lseg; - state->rpcdata = rpcdata; - state->pages = pages; - state->pgbase = pgbase; - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - state->offset = offset; - state->count = count; - state->sync = 0; - - return state; -} - -static void -objlayout_free_io_state(struct objlayout_io_state *state) -{ - dprintk("%s: freeing io_state\n", __func__); - if (unlikely(!state)) - return; - - objio_free_io_state(state); } /* @@ -217,12 +181,10 @@ objlayout_free_io_state(struct objlayout_io_state *state) static void objlayout_iodone(struct objlayout_io_state *state) { - dprintk("%s: state %p status\n", __func__, state); - if (likely(state->status >= 0)) { - objlayout_free_io_state(state); + objio_free_result(state); } else { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + struct objlayout *objlay = state->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; @@ -289,15 +251,15 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) { struct nfs_read_data *rdata = state->rpcdata; - state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, - status, rdata->res.eof); - rdata->task.tk_status = status; + state->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; objlayout_iodone(state); /* must not use state after this point */ + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); + if (sync) pnfs_ld_read_done(rdata); else { @@ -314,7 +276,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) { loff_t offset = rdata->args.offset; size_t count = rdata->args.count; - struct objlayout_io_state *state; int err; loff_t eof; @@ -331,20 +292,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) } rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + &rdata->args.pgbase, + rdata->args.offset, rdata->args.count); - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, - rdata->args.pages, rdata->args.pgbase, - offset, count, - rdata->lseg, rdata, - GFP_KERNEL); - if (unlikely(!state)) { - err = -ENOMEM; - goto out; - } dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", __func__, rdata->inode->i_ino, offset, 
count, rdata->res.eof); - err = objio_read_pagelist(state); + err = objio_read_pagelist(rdata); out: if (unlikely(err)) { rdata->pnfs_error = err; @@ -374,23 +329,18 @@ void objlayout_write_done(struct objlayout_io_state *state, ssize_t status, bool sync) { - struct nfs_write_data *wdata; + struct nfs_write_data *wdata = state->rpcdata; - dprintk("%s: Begin\n", __func__); - wdata = state->rpcdata; - state->status = status; - wdata->task.tk_status = status; + state->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; wdata->verf.committed = state->committed; - dprintk("%s: Return status %d committed %d\n", - __func__, wdata->task.tk_status, - wdata->verf.committed); - } else - dprintk("%s: Return status %d\n", - __func__, wdata->task.tk_status); + } objlayout_iodone(state); - /* must not use state after this point */ + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); if (sync) pnfs_ld_write_done(wdata); @@ -407,25 +357,13 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objlayout_io_state *state; int err; - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, - wdata->args.pages, - wdata->args.pgbase, - wdata->args.offset, - wdata->args.count, - wdata->lseg, wdata, - GFP_NOFS); - if (unlikely(!state)) { - err = -ENOMEM; - goto out; - } + _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); - state->sync = how & FLUSH_SYNC; - - err = objio_write_pagelist(state, how & FLUSH_STABLE); - out: + err = objio_write_pagelist(wdata, how); if (unlikely(err)) { wdata->pnfs_error = err; dprintk("%s: Returned Error %d\n", __func__, err); @@ -564,7 +502,7 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } list_del(&state->err_list); - objlayout_free_io_state(state); + objio_free_result(state); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -632,7 +570,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, goto loop_done; } list_del(&state->err_list); - objlayout_free_io_state(state); + objio_free_result(state); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 4edac9b6ac0c..d7b2ccfa2132 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -75,14 +75,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * embedded in objects provider io_state data structure */ struct objlayout_io_state { - struct pnfs_layout_segment *lseg; - - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; - bool sync; + struct objlayout *objlay; void *rpcdata; int status; /* res */ @@ -99,6 +92,18 @@ struct objlayout_io_state { struct pnfs_osd_ioerr *ioerrs; }; +static inline +void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + /* * Raid engine I/O API */ @@ -109,15 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern int objio_alloc_io_state( - struct pnfs_layout_segment *lseg, - 
struct objlayout_io_state **outp, - gfp_t gfp_flags); -extern void objio_free_io_state(struct objlayout_io_state *state); +extern void objio_free_result(struct objlayout_io_state *state); -extern int objio_read_pagelist(struct objlayout_io_state *ol_state); -extern int objio_write_pagelist(struct objlayout_io_state *ol_state, - bool stable); +extern int objio_read_pagelist(struct nfs_read_data *rdata); +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API @@ -127,10 +127,8 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state, int osd_error, u64 offset, u64 length, bool is_write); static inline void -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); - /* If one of the I/Os errored out and the delta_space_used was * invalid we render the complete report as invalid. Protocol mandate * the DSU be accurate or not reported. -- cgit v1.2.3-58-ga151 From e2e04355d9647305c666462a49223f2942a635f0 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:03:35 -0700 Subject: pnfs-obj: Rename objlayout_io_state => objlayout_io_res * All instances of objlayout_io_state => objlayout_io_res * All instances of state => oir; * All instances of ol_state => oir; Big but nothing to it Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 17 ++++++------ fs/nfs/objlayout/objlayout.c | 63 ++++++++++++++++++++++---------------------- fs/nfs/objlayout/objlayout.h | 15 ++++++----- 3 files changed, 48 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 48eb91aad554..2347e0ac63e6 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -146,7 +146,7 @@ typedef int (*objio_done_fn)(struct objio_state *ios); struct objio_state { /* Generic layer */ - struct objlayout_io_state ol_state; + struct objlayout_io_res oir; struct page **pages; unsigned pgbase; @@ -422,7 +422,7 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ios = &aos->objios; ios->layout = objio_seg; - objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps, + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps, aos->ioerrs, rpcdata, pnfs_layout_type); ios->pages = pages; @@ -437,10 +437,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, return 0; } -void objio_free_result(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios = container_of(oir, struct objio_state, oir); kfree(ios); } @@ -519,7 +518,7 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - objlayout_io_set_result(&ios->ol_state, i, + objlayout_io_set_result(&ios->oir, i, &ios->layout->comps[i].oc_object_id, osd_pri_2_pnfs_err(osi.osd_err_pri), ios->per_dev[i].offset, @@ -812,7 +811,7 @@ static int _read_done(struct objio_state *ios) else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->sync); + objlayout_read_done(&ios->oir, status, ios->sync); return ret; } @@ -906,13 +905,13 @@ static int _write_done(struct objio_state *ios) if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - 
ios->ol_state.committed = NFS_FILE_SYNC; + ios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->sync); + objlayout_write_done(&ios->oir, status, ios->sync); return ret; } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index a82053ae5595..72074e3a04f9 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -179,16 +179,16 @@ void _fix_verify_io_params(struct pnfs_layout_segment *lseg, * I/O done common code */ static void -objlayout_iodone(struct objlayout_io_state *state) +objlayout_iodone(struct objlayout_io_res *oir) { - if (likely(state->status >= 0)) { - objio_free_result(state); + if (likely(oir->status >= 0)) { + objio_free_result(oir); } else { - struct objlayout *objlay = state->objlay; + struct objlayout *objlay = oir->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &state->err_list); + list_add(&objlay->err_list, &oir->err_list); spin_unlock(&objlay->lock); } } @@ -200,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) * the error for later reporting at layout-return. */ void -objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - BUG_ON(index >= state->num_comps); + BUG_ON(index >= oir->num_comps); if (osd_error) { ioerr->oer_component = *pooid; ioerr->oer_comp_offset = offset; @@ -247,15 +247,15 @@ static void _rpc_read_complete(struct work_struct *work) } void -objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = state->rpcdata; + struct nfs_read_data *rdata = oir->rpcdata; - state->status = rdata->task.tk_status = status; + oir->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; - objlayout_iodone(state); - /* must not use state after this point */ + objlayout_iodone(oir); + /* must not use oir after this point */ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, status, rdata->res.eof, sync); @@ -326,17 +326,16 @@ static void _rpc_write_complete(struct work_struct *work) } void -objlayout_write_done(struct objlayout_io_state *state, ssize_t status, - bool sync) +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = state->rpcdata; + struct nfs_write_data *wdata = oir->rpcdata; - state->status = wdata->task.tk_status = status; + oir->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; - wdata->verf.committed = state->committed; + wdata->verf.committed = oir->committed; } - objlayout_iodone(state); + objlayout_iodone(oir); /* must not use oir after this point */ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, @@ -475,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, static void encode_accumulated_error(struct objlayout *objlay, __be32 *p) { - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, 
&objlay->err_list, err_list) { unsigned i; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -501,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } - list_del(&state->err_list); - objio_free_result(state); + list_del(&oir->err_list); + objio_free_result(oir); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -514,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, const struct nfs4_layoutreturn_args *args) { struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; __be32 *start; dprintk("%s: Begin\n", __func__); @@ -523,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, spin_lock(&objlay->lock); - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { __be32 *last_xdr = NULL, *p; unsigned i; int res = 0; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -553,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, } last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); } /* TODO: use xdr_write_pages */ @@ -569,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, encode_accumulated_error(objlay, last_xdr); goto loop_done; } - list_del(&state->err_list); - objio_free_result(state); + list_del(&oir->err_list); + objio_free_result(oir); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index d7b2ccfa2132..8ec34727ed21 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -74,7 +74,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * per-I/O operation state * embedded in objects provider io_state data structure */ -struct objlayout_io_state { +struct objlayout_io_res { struct objlayout *objlay; void *rpcdata; @@ -93,7 +93,7 @@ struct objlayout_io_state { }; static inline -void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps, +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, struct pnfs_osd_ioerr *ioerrs, void *rpcdata, struct pnfs_layout_hdr *pnfs_layout_type) { @@ -114,7 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern void objio_free_result(struct objlayout_io_state *state); +/* objio_free_result will free these @oir structs recieved from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); extern int objio_read_pagelist(struct nfs_read_data *rdata); extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); @@ -122,7 +125,7 @@ extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API */ -extern void objlayout_io_set_result(struct objlayout_io_state *state, +extern void objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); @@ -141,9 +144,9 @@ 
objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) spin_unlock(&objlay->lock); } -extern void objlayout_read_done(struct objlayout_io_state *state, +extern void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_state *state, +extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, -- cgit v1.2.3-58-ga151 From af4f5b54bcf0379089d01518e818f37258708fb7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:04:19 -0700 Subject: pnfs-obj: move to ore 01: ore_layout & ore_components For ease of reviewing I split the move to ore into 3 parts: move to ore 01: ore_layout & ore_components move to ore 02: move to ORE move to ore 03: Remove old raid engine This patch modifies objio_lseg (the layout-segment level) and its devices and components arrays to use the ORE types. Though it will be removed soon, the raid engine is also modified so it actually compiles, and possibly runs, with the new types. So it is the same old raid engine, but with some new ORE types. For ease of reviewing, some of the old code is "#if 0"-ed out but not removed, so the diff command works better. The old code will be removed in the 3rd patch. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 272 ++++++++++++++++++++----------------------- 1 file changed, 128 insertions(+), 144 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 2347e0ac63e6..bd7ec26e2840 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,7 +38,7 @@ */ #include -#include +#include #include "objlayout.h" @@ -52,7 +52,7 @@ enum { BIO_MAX_PAGES_KMALLOC = struct objio_dev_ent { struct nfs4_deviceid_node id_node; - struct osd_dev *od; + struct ore_dev od; }; static void @@ -60,8 +60,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) { struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - dprintk("%s: free od=%p\n", __func__, de->od); - osduld_put_device(de->od); + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); kfree(de); } @@ -98,12 +98,12 @@ _dev_list_add(const struct nfs_server *nfss, nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - de->od = od; + de->od.od = od; d = nfs4_insert_deviceid_node(&de->id_node); n = container_of(d, struct objio_dev_ent, id_node); if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); objio_free_deviceid_node(&de->id_node); de = n; } @@ -111,28 +111,11 @@ _dev_list_add(const struct nfs_server *nfss, return de; } -struct caps_buffers { - u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; - u8 creds[OSD_CAP_LEN]; -}; - struct objio_segment { struct pnfs_layout_segment lseg; - struct pnfs_osd_object_cred *comps; - - unsigned mirrors_p1; - unsigned stripe_unit; - unsigned group_width; /* Data stripe_units without integrity comps */ - u64 group_depth; - unsigned group_count; - - unsigned max_io_size; - - unsigned comps_index; - unsigned num_comps; - /* variable length */ - struct objio_dev_ent *ods[]; + struct ore_layout layout; + struct ore_components oc; }; static inline struct objio_segment * @@ -155,7 +138,8 @@ struct objio_state { loff_t offset; bool sync; - struct objio_segment *layout; + struct ore_layout *layout; + struct ore_components *oc; struct kref kref;
objio_done_fn done; @@ -175,32 +159,33 @@ struct objio_state { /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned comp, - gfp_t gfp_flags) +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct nfs4_deviceid *d_id; struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; int err; - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) - return ode; + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); if (unlikely(err)) { dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return ERR_PTR(err); + return err; } odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); err = -EINVAL; goto out; } else if (odi.systemid_len) @@ -225,38 +210,15 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, gfp_flags); - + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); out: - dprintk("%s: return=%d\n", __func__, err); objlayout_put_deviceinfo(deviceaddr); - return err ? 
ERR_PTR(err) : ode; -} - -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, - gfp_t gfp_flags) -{ - unsigned i; - int err; - - /* lookup all devices */ - for (i = 0; i < objio_seg->num_comps; i++) { - struct objio_dev_ent *ode; - - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); - if (unlikely(IS_ERR(ode))) { - err = PTR_ERR(ode); - goto out; - } - objio_seg->ods[i] = ode; - } - err = 0; - -out: - dprintk("%s: return=%d\n", __func__, err); return err; } +#if 0 static int _verify_data_map(struct pnfs_osd_layout *layout) { struct pnfs_osd_data_map *data_map = &layout->olo_map; @@ -296,23 +258,45 @@ static int _verify_data_map(struct pnfs_osd_layout *layout) return 0; } +#endif -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, - struct pnfs_osd_object_cred *src_comp, - struct caps_buffers *caps_p) +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) { - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + struct ore_comp *ocomp = &oc->comps[c]; - *cur_comp = *src_comp; + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, - sizeof(caps_p->caps_key)); - cur_comp->oc_cap_key.cred = caps_p->caps_key; + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - memcpy(caps_p->creds, src_comp->oc_cap.cred, - sizeof(caps_p->creds)); - cur_comp->oc_cap.cred = caps_p->creds; + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); +} + +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) +{ + struct __alloc_objio_segment { + struct objio_segment olseg; + struct ore_dev *ods[numdevs]; + struct ore_comp comps[numdevs]; + } *aolseg; + + aolseg = kzalloc(sizeof(*aolseg), gfp_flags); + if (unlikely(!aolseg)) { + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + numdevs, sizeof(*aolseg)); + return -ENOMEM; + } + + aolseg->olseg.oc.numdevs = numdevs; + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; + aolseg->olseg.oc.comps = aolseg->comps; + aolseg->olseg.oc.ods = aolseg->ods; + + *pseg = &aolseg->olseg; + return 0; } int objio_alloc_lseg(struct pnfs_layout_segment **outp, @@ -324,59 +308,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred *cur_comp, src_comp; - struct caps_buffers *caps_p; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; int err; err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); if (unlikely(err)) return err; - err = _verify_data_map(&layout); + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); if (unlikely(err)) return err; - objio_seg = kzalloc(sizeof(*objio_seg) + - sizeof(objio_seg->ods[0]) * layout.olo_num_comps + - sizeof(*objio_seg->comps) * layout.olo_num_comps + - sizeof(struct caps_buffers) * layout.olo_num_comps, - gfp_flags); - if (!objio_seg) - return -ENOMEM; + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + 
objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); - cur_comp = objio_seg->comps; - caps_p = (void *)(cur_comp + layout.olo_num_comps); - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) - copy_single_comp(cur_comp++, &src_comp, caps_p++); + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); if (unlikely(err)) goto err; - objio_seg->num_comps = layout.olo_num_comps; - objio_seg->comps_index = layout.olo_comps_index; - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); - if (err) - goto err; - - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; - if (layout.olo_map.odm_group_width) { - objio_seg->group_width = layout.olo_map.odm_group_width; - objio_seg->group_depth = layout.olo_map.odm_group_depth; - objio_seg->group_count = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1 / - objio_seg->group_width; - } else { - objio_seg->group_width = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1; - objio_seg->group_depth = -1; - objio_seg->group_count = 1; + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; } - - /* Cache this calculation it will hit for every page */ - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - - objio_seg->stripe_unit) * - objio_seg->group_width; + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; *outp = &objio_seg->lseg; return 0; @@ -393,10 +361,14 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - for (i = 0; i < objio_seg->num_comps; i++) { - if (!objio_seg->ods[i]) + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) break; - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); } kfree(objio_seg); } @@ -411,8 +383,8 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, struct objio_state *ios; struct __alloc_objio_state { struct objio_state objios; - struct _objio_per_comp per_dev[objio_seg->num_comps]; - struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps]; + struct _objio_per_comp per_dev[objio_seg->oc.numdevs]; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; } *aos; aos = kzalloc(sizeof(*aos), gfp_flags); @@ -421,8 +393,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ios = &aos->objios; - ios->layout = objio_seg; - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps, + ios->layout = &objio_seg->layout; + ios->oc = &objio_seg->oc; + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, aos->ioerrs, rpcdata, pnfs_layout_type); ios->pages = pages; @@ -474,6 +447,27 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } +static void __on_dev_error(struct objio_state *ios, bool is_write, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) +{ + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = 
container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; + + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; + + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, is_write); +} + static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -518,12 +512,9 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - objlayout_io_set_result(&ios->oir, i, - &ios->layout->comps[i].oc_object_id, - osd_pri_2_pnfs_err(osi.osd_err_pri), - ios->per_dev[i].offset, - ios->per_dev[i].length, - is_write); + __on_dev_error(ios, is_write, ios->oc->ods[i], + ios->per_dev[i].dev, osi.osd_err_pri, + ios->per_dev[i].offset, ios->per_dev[i].length); if (osi.osd_err_pri >= oep) { oep = osi.osd_err_pri; @@ -558,11 +549,11 @@ static void _io_free(struct objio_state *ios) struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) { - unsigned min_dev = ios->layout->comps_index; - unsigned max_dev = min_dev + ios->layout->num_comps; + unsigned min_dev = ios->oc->first_dev; + unsigned max_dev = min_dev + ios->oc->numdevs; BUG_ON(dev < min_dev || max_dev <= dev); - return ios->layout->ods[dev - min_dev]->od; + return ios->oc->ods[dev - min_dev]->od; } struct _striping_info { @@ -820,12 +811,9 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) struct osd_request *or = NULL; struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; unsigned dev = per_dev->dev; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct ore_comp *cred = + &ios->oc->comps[cur_comp]; + struct osd_obj_id obj = cred->obj; int ret; or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); @@ -837,7 +825,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + ret = osd_finalize_request(or, 0, cred->cred, NULL); if (ret) { dprintk("%s: Faild to osd_finalize_request() => %d\n", __func__, ret); @@ -924,12 +912,8 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct osd_request *or = NULL; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct ore_comp *cred = &ios->oc->comps[cur_comp]; + struct osd_obj_id obj = cred->obj; struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; struct bio *bio; @@ -964,7 +948,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + ret = osd_finalize_request(or, 0, cred->cred, NULL); if (ret) { dprintk("%s: Faild to osd_finalize_request() => %d\n", __func__, ret); @@ -1030,7 +1014,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, return false; return pgio->pg_count + req->wb_bytes <= - OBJIO_LSEG(pgio->pg_lseg)->max_io_size; + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; 
} static const struct nfs_pageio_ops objio_pg_read_ops = { -- cgit v1.2.3-58-ga151 From eecfc6312a24e6d0d2883de0a9a6ccf8e993f472 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:15:38 -0700 Subject: pnfs-obj: move to ore 02: move to ORE In this patch we actually move to the ORE (Object Raid Engine). objio_state holds a pointer to an ore_io_state. Once we have an ore_io_state at hand, we can call the ORE for reading/writing. We register on the done path to kick off the NFS io_done mechanism. Again, for ease of reviewing, the old code is "#if 0"-ed out but not removed, so the diff command works better. The old code will be removed in the next patch. fs/exofs/Kconfig::ORE is modified to also be auto-included if PNFS_OBJLAYOUT is set, since we now depend on ORE. (See comments in fs/exofs/Kconfig.) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/exofs/Kconfig | 2 +- fs/nfs/objlayout/objio_osd.c | 133 +++++++++++++++++++------------------------ 2 files changed, 60 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index fa9a286c8771..da42f32c49be 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -5,7 +5,7 @@ # selected by any of the users. config ORE tristate - depends on EXOFS_FS + depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR default SCSI_OSD_ULD diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index bd7ec26e2840..00b384934c32 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -44,12 +44,6 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -#define _LLU(x) ((unsigned long long)x) - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -}; - struct objio_dev_ent { struct nfs4_deviceid_node id_node; struct ore_dev od; @@ -124,37 +118,13 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } -struct objio_state; -typedef int (*objio_done_fn)(struct objio_state *ios); - struct objio_state { /* Generic layer */ struct objlayout_io_res oir; - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; bool sync; - - struct ore_layout *layout; - struct ore_components *oc; - - struct kref kref; - objio_done_fn done; - void *private; - - unsigned long length; - unsigned numdevs; /* Actually used devs in this IO */ - /* A per-device variable array of size numdevs */ - struct _objio_per_comp { - struct bio *bio; - struct osd_request *or; - unsigned long length; - u64 offset; - unsigned dev; - } per_dev[]; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; }; /* Send and wait for a get_device_info of devices in the layout, @@ -374,16 +344,16 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) } static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct objio_state *ios; + struct ore_io_state *ios; + int ret; struct __alloc_objio_state { struct objio_state objios; - struct _objio_per_comp per_dev[objio_seg->oc.numdevs]; struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; } *aos; @@ -391,30 +361,33 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, if (unlikely(!aos))
return -ENOMEM; - ios = &aos->objios; - - ios->layout = &objio_seg->layout; - ios->oc = &objio_seg->oc; objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, aos->ioerrs, rpcdata, pnfs_layout_type); + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + ios->pages = pages; ios->pgbase = pgbase; - ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - ios->offset = offset; - ios->count = count; - ios->sync = 0; + ios->private = aos; BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - *outp = ios; + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; return 0; } void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(oir, struct objio_state, oir); + struct objio_state *objios = container_of(oir, struct objio_state, oir); - kfree(ios); + ore_put_io_state(objios->ios); + kfree(objios); } enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) @@ -447,7 +420,7 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } -static void __on_dev_error(struct objio_state *ios, bool is_write, +static void __on_dev_error(struct ore_io_state *ios, struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, u64 dev_offset, u64 dev_len) { @@ -465,9 +438,10 @@ static void __on_dev_error(struct objio_state *ios, bool is_write, objlayout_io_set_result(&objios->oir, comp, &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, is_write); + dev_offset, dev_len, !ios->reading); } +#if 0 static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -786,26 +760,28 @@ static int _io_exec(struct objio_state *ios) return ret; } +#endif /* * read */ -static int _read_done(struct objio_state *ios) +static void _read_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, false); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) status = ios->length; else status = ret; - objlayout_read_done(&ios->oir, status, ios->sync); - return ret; + objlayout_read_done(&objios->oir, status, objios->sync); } +#if 0 static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) { struct osd_request *or = NULL; @@ -860,49 +836,50 @@ err: _io_free(ios); return ret; } +#endif int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios; + struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, rdata->lseg, rdata->args.pages, rdata->args.pgbase, rdata->args.offset, rdata->args.count, rdata, - GFP_KERNEL, &ios); - if (unlikely(ret)) - return ret; - - ret = _io_rw_pagelist(ios, GFP_KERNEL); + GFP_KERNEL, &objios); if (unlikely(ret)) return ret; - return _read_exec(ios); + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + return ore_read(objios->ios); } /* * write */ -static int _write_done(struct objio_state *ios) +static void _write_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, true); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) { /* FIXME: 
should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->oir.committed = NFS_FILE_SYNC; + objios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->oir, status, ios->sync); - return ret; + objlayout_write_done(&objios->oir, status, objios->sync); } +#if 0 static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) { struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; @@ -984,27 +961,35 @@ err: _io_free(ios); return ret; } +#endif int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objio_state *ios; + struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, wdata->lseg, wdata->args.pages, wdata->args.pgbase, wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, - &ios); + &objios); if (unlikely(ret)) return ret; - ios->sync = 0 != (how & FLUSH_SYNC); + objios->sync = 0 != (how & FLUSH_SYNC); - /* TODO: ios->stable = stable; */ - ret = _io_rw_pagelist(ios, GFP_NOFS); + if (!objios->sync) + objios->ios->done = _write_done; + + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); if (unlikely(ret)) return ret; - return _write_exec(ios); + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; } static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, -- cgit v1.2.3-58-ga151 From 04291b628c450ab6fdb606836585f16336662a4e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:16:15 -0700 Subject: pnfs-obj: move to ore 03: Remove old raid engine Finally remove all the old raid engine, which is by now dead code. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 504 ------------------------------------------- 1 file changed, 504 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 00b384934c32..3161da654a9b 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -188,48 +188,6 @@ out: return err; } -#if 0 -static int _verify_data_map(struct pnfs_osd_layout *layout) -{ - struct pnfs_osd_data_map *data_map = &layout->olo_map; - u64 stripe_length; - u32 group_width; - -/* FIXME: Only raid0 for now. 
if not go through MDS */ - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { - printk(KERN_ERR "Only RAID_0 for now\n"); - return -ENOTSUPP; - } - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", - data_map->odm_num_comps, data_map->odm_mirror_cnt); - return -EINVAL; - } - - if (data_map->odm_group_width) - group_width = data_map->odm_group_width; - else - group_width = data_map->odm_num_comps / - (data_map->odm_mirror_cnt + 1); - - stripe_length = (u64)data_map->odm_stripe_unit * group_width; - if (stripe_length >= (1ULL << 32)) { - printk(KERN_ERR "Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -ENOTSUPP; - } - - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { - printk(KERN_ERR "Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(data_map->odm_stripe_unit), PAGE_SIZE); - return -ENOTSUPP; - } - - return 0; -} -#endif - static void copy_single_comp(struct ore_components *oc, unsigned c, struct pnfs_osd_object_cred *src_comp) { @@ -441,327 +399,6 @@ static void __on_dev_error(struct ore_io_state *ios, dev_offset, dev_len, !ios->reading); } -#if 0 -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -static int _io_check(struct objio_state *ios, bool is_write) -{ - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; - int lin_ret = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (!or) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - BUG_ON(is_write); - _clear_bio(ios->per_dev[i].bio); - dprintk("%s: start read offset passed end of file " - "offset=0x%llx, length=0x%lx\n", __func__, - _LLU(ios->per_dev[i].offset), - ios->per_dev[i].length); - - continue; /* we recovered */ - } - __on_dev_error(ios, is_write, ios->oc->ods[i], - ios->per_dev[i].dev, osi.osd_err_pri, - ios->per_dev[i].offset, ios->per_dev[i].length); - - if (osi.osd_err_pri >= oep) { - oep = osi.osd_err_pri; - lin_ret = ret; - } - } - - return lin_ret; -} - -/* - * Common IO state helpers. 
- */ -static void _io_free(struct objio_state *ios) -{ - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct _objio_per_comp *per_dev = &ios->per_dev[i]; - - if (per_dev->or) { - osd_end_request(per_dev->or); - per_dev->or = NULL; - } - - if (per_dev->bio) { - bio_put(per_dev->bio); - per_dev->bio = NULL; - } - } -} - -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) -{ - unsigned min_dev = ios->oc->first_dev; - unsigned max_dev = min_dev + ios->oc->numdevs; - - BUG_ON(dev < min_dev || max_dev <= dev); - return ios->oc->ods[dev - min_dev]->od; -} - -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - u32 U = stripe_unit * group_width; - - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodU = file_offset - M * S; - u32 G = div64_u64(LmodU, T); - u64 H = LmodU - G * T; - - u32 N = div_u64(H, U); - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int len, - gfp_t gfp_flags) -{ - unsigned pg = *cur_pg; - int cur_len = len; - struct request_queue *q = - osd_request_queue(_io_od(ios, per_dev->dev)); - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; - - if (BIO_MAX_PAGES_KMALLOC < bio_size) - bio_size = BIO_MAX_PAGES_KMALLOC; - - per_dev->bio = bio_kmalloc(gfp_flags, bio_size); - if (unlikely(!per_dev->bio)) { - dprintk("Faild to allocate BIO size=%u\n", bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, - ios->pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct objio_state *ios, u64 length, - struct _striping_info *si, unsigned *last_pg, - gfp_t gfp_flags) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = *last_pg; - int ret = 0; - - while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && - (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev - first_dev) - max_comp = dev - first_dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len, gfp_flags); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - ios->length += cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - *last_pg = cur_pg; - return ret; -} - -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) -{ - u64 length = ios->count; - u64 offset = ios->offset; - struct _striping_info si; - unsigned last_pg = 0; - int ret = 0; - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - if (!ios->length) - return ret; - - return 0; -} - -static int _sync_done(struct objio_state *ios) -{ - struct completion *waiting = ios->private; - - complete(waiting); - return 0; -} - -static void _last_io(struct kref *kref) -{ - struct objio_state *ios = container_of(kref, struct objio_state, kref); - - ios->done(ios); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct objio_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static int _io_exec(struct objio_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - unsigned i; - objio_done_fn saved_done_fn = ios->done; - bool sync = ios->sync; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - - if (!or) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - - if (sync) { - wait_for_completion(&wait); - ret = saved_done_fn(ios); - } - - return ret; -} -#endif - /* * read */ @@ -781,63 +418,6 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -#if 0 -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct osd_request *or = NULL; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - unsigned dev = per_dev->dev; - struct ore_comp *cred = - &ios->oc->comps[cur_comp]; - struct osd_obj_id obj = cred->obj; - int ret; - - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - 
} - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - -err: - return ret; -} - -static int _read_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _read_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _read_done; - return _io_exec(ios); - -err: - _io_free(ios); - return ret; -} -#endif - int objio_read_pagelist(struct nfs_read_data *rdata) { struct objio_state *objios; @@ -879,90 +459,6 @@ static void _write_done(struct ore_io_state *ios, void *private) objlayout_write_done(&objios->oir, status, objios->sync); } -#if 0 -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret; - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct osd_request *or = NULL; - struct ore_comp *cred = &ios->oc->comps[cur_comp]; - struct osd_obj_id obj = cred->obj; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - struct bio *bio; - - or = osd_start_request(_io_od(ios, dev), GFP_NOFS); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_NOFS, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - dprintk("Faild to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto err; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->bio = bio; - per_dev->dev = dev; - per_dev->length = master_dev->length; - per_dev->offset = master_dev->offset; - } else { - bio = master_dev->bio; - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - } - -err: - return ret; -} - -static int _write_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _write_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _write_done; - return _io_exec(ios); - -err: - _io_free(ios); - return ret; -} -#endif - int objio_write_pagelist(struct nfs_write_data *wdata, int how) { struct objio_state *objios; -- cgit v1.2.3-58-ga151 From 278c023a99b0d6b471d0f4a79835c703482e29ac Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:16:54 -0700 Subject: pnfs-obj: Support for RAID5 read-4-write interface. The ORE needs to be supplied a r4w_get_page/r4w_put_page API by the filesystem so it can get cache pages to read into when writing partial stripes.
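To make the contract concrete, here is a minimal sketch of how an ORE-side read-modify-write could consume these hooks; the helper name, the private-pointer plumbing and the read step are assumptions made for illustration, only the get_page/put_page hook pair comes from the patch below:

/* Illustrative sketch only - not the actual ore code */
static int ore_fill_missing_page(struct ore_io_state *ios, u64 offset)
{
	bool uptodate;
	struct page *page;

	/* ask the filesystem for the cache page covering @offset */
	page = ios->r4w->get_page(ios->private, offset, &uptodate);
	if (unlikely(!page))
		return -ENOMEM;

	if (!uptodate) {
		/* read the old stripe unit from the OSD into @page
		 * before folding it into the new parity */
	}

	ios->r4w->put_page(ios->private, page);
	return 0;
}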
Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 3161da654a9b..c807ab93140e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -459,6 +459,43 @@ static void _write_done(struct ore_io_state *ios, void *private) objlayout_write_done(&objios->oir, status, objios->sync); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; + struct page *page = find_get_page(wdata->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, page->index); + page_cache_release(page); + return; +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + int objio_write_pagelist(struct nfs_write_data *wdata, int how) { struct objio_state *objios; @@ -472,6 +509,7 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) return ret; objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; if (!objios->sync) objios->ios->done = _write_done; -- cgit v1.2.3-58-ga151 From b58dc4100b9190f2cb437f1f67ffcb9f9acc4923 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Mar 2011 15:53:40 -0700 Subject: ceph: clear parent D_COMPLETE flag when on dentry prune When the VFS prunes a dentry from the cache, clear the D_COMPLETE flag on the parent dentry. Do this for the live and snapshotted namespaces. Do not bother for the .snap dir contents, since we do not cache that. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 28 ++++++++++++++++++++++++++++ fs/ceph/super.h | 13 +++++++++++++ 2 files changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 382abc9a6a54..97fb155b3242 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1092,7 +1092,33 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + dout("d_release %p\n", dentry); + + /* do we have a valid parent? */ + if (!dentry->d_parent || IS_ROOT(dentry)) + return; + + /* if we are not hashed, we don't affect D_COMPLETE */ + if (d_unhashed(dentry)) + return; + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + di = ceph_dentry(dentry->d_parent); + clear_bit(CEPH_D_COMPLETE, &di->flags); +} /* * read() on a dir. 
This weird interface hack only works if mounted @@ -1306,6 +1332,7 @@ const struct inode_operations ceph_dir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; const struct dentry_operations ceph_snapdir_dentry_ops = { @@ -1315,4 +1342,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = { const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b01442aaf278..98e60693c9a1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -203,6 +203,7 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -213,6 +214,18 @@ struct ceph_dentry_info { u64 offset; }; +/* + * dentry flags + * + * The locking for D_COMPLETE is a bit odd: + * - we can clear it at almost any time (see ceph_d_prune) + * - it is only meaningful if: + * - we hold dir inode i_lock + * - we hold dir FILE_SHARED caps + * - the dentry D_COMPLETE is set + */ +#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing -- cgit v1.2.3-58-ga151 From 161ebf9fcca967e7396bb076af5ed18539497a3f Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 29 Oct 2011 17:17:58 +0400 Subject: CIFS: Simplify setlk error handling for mandatory locking Now we allocate the lock structure first, then send the request to the server, and save the lock through a void function if the server returned OK - this prevents the situation where we lock a file on the server and then return -ENOMEM from setlk.
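Stripped of the cifs specifics, the ordering rule this enforces looks like the following sketch (all function and type names here are made up for illustration):

static int setlk_pattern(void)
{
	struct some_lock *lock;
	int rc;

	lock = alloc_lock_struct();	/* the only step that can fail locally */
	if (!lock)
		return -ENOMEM;		/* nothing was sent to the server yet */

	rc = send_lock_to_server(lock);	/* the only step that changes server state */
	if (rc) {
		kfree(lock);		/* server holds no lock; local cleanup only */
		return rc;
	}

	record_lock(lock);		/* void: cannot fail once the server said OK */
	return 0;
}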
Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 64 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c1f063cd1b0c..d9cc07fec99f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) } static bool -cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, +__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, __u8 type, __u16 netfid, struct cifsLockInfo **conf_lock) { @@ -694,6 +694,14 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, return false; } +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + struct cifsLockInfo **conf_lock) +{ + return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, + lock->type, lock->netfid, conf_lock); +} + static int cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, __u8 type, __u16 netfid, struct file_lock *flock) @@ -704,8 +712,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, mutex_lock(&cinode->lock_mutex); - exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, - &conf_lock); + exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -723,40 +731,27 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, return rc; } -static int -cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, - __u8 type, __u16 netfid) +static void +cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) { - struct cifsLockInfo *li; - - li = cifs_lock_init(len, offset, type, netfid); - if (!li) - return -ENOMEM; - mutex_lock(&cinode->lock_mutex); - list_add_tail(&li->llist, &cinode->llist); + list_add_tail(&lock->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); - return 0; } static int -cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, - __u8 type, __u16 netfid, bool wait) +cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + bool wait) { - struct cifsLockInfo *lock, *conf_lock; + struct cifsLockInfo *conf_lock; bool exist; int rc = 0; - lock = cifs_lock_init(length, offset, type, netfid); - if (!lock) - return -ENOMEM; - try_again: exist = false; mutex_lock(&cinode->lock_mutex); - exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, - &conf_lock); + exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); @@ -781,7 +776,6 @@ try_again: } } - kfree(lock); mutex_unlock(&cinode->lock_mutex); return rc; } @@ -1254,20 +1248,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, } if (lock) { - rc = cifs_lock_add_if(cinode, flock->fl_start, length, - type, netfid, wait_flag); + struct cifsLockInfo *lock; + + lock = cifs_lock_init(length, flock->fl_start, type, netfid); + if (!lock) + return -ENOMEM; + + rc = cifs_lock_add_if(cinode, lock, wait_flag); if (rc < 0) - return rc; - else if (!rc) + kfree(lock); + if (rc <= 0) goto out; rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, flock->fl_start, 0, 1, type, wait_flag, 0); - if (rc == 0) { - /* For 
Windows locks we must store them. */ - rc = cifs_lock_add(cinode, length, flock->fl_start, - type, netfid); + if (rc) { + kfree(lock); + goto out; } + + cifs_lock_add(cinode, lock); } else if (unlock) rc = cifs_unlock_range(cfile, flock, xid); -- cgit v1.2.3-58-ga151 From a88b470773bc5b640292d8be7b8e7e1011a86639 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 29 Oct 2011 17:17:59 +0400 Subject: CIFS: Cleanup byte-range locking code style Reorder parms of cifs_lock_init, trivially simplify getlk code and remove extra {} in cifs_lock_add_if. Cc: Dan Carpenter Acked-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d9cc07fec99f..cf0b1539b321 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file) } static struct cifsLockInfo * -cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) { - struct cifsLockInfo *li = + struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (!li) - return li; - li->netfid = netfid; - li->offset = offset; - li->length = len; - li->type = type; - li->pid = current->tgid; - INIT_LIST_HEAD(&li->blist); - init_waitqueue_head(&li->block_q); - return li; + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->netfid = netfid; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; } static void @@ -770,10 +770,8 @@ try_again: (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - else { - mutex_lock(&cinode->lock_mutex); - list_del_init(&lock->blist); - } + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); } mutex_unlock(&cinode->lock_mutex); @@ -927,7 +925,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) else type = CIFS_WRLCK; - lck = cifs_lock_init(length, flock->fl_start, type, + lck = cifs_lock_init(flock->fl_start, length, type, cfile->netfid); if (!lck) { rc = -ENOMEM; @@ -1064,14 +1062,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, if (rc != 0) cERROR(1, "Error unlocking previously locked " "range %d during test of lock", rc); - rc = 0; - return rc; + return 0; } if (type & LOCKING_ANDX_SHARED_LOCK) { flock->fl_type = F_WRLCK; - rc = 0; - return rc; + return 0; } rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, @@ -1089,8 +1085,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, } else flock->fl_type = F_WRLCK; - rc = 0; - return rc; + return 0; } static void @@ -1250,7 +1245,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, if (lock) { struct cifsLockInfo *lock; - lock = cifs_lock_init(length, flock->fl_start, type, netfid); + lock = cifs_lock_init(flock->fl_start, length, type, netfid); if (!lock) return -ENOMEM; -- cgit v1.2.3-58-ga151 From 6070295efc90d1093b2031c43380bd7d9673c802 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Nov 2011 07:04:10 -0400 Subject: nfs: set vs_hidden on nfs4_callback_version4 (try #2) This service should not be registered with or unregistered from rpcbind. 
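For context, the sunrpc server consults this flag when registering program versions with rpcbind; the check looks roughly like this (paraphrased from net/sunrpc/svc.c of this era, surrounding details elided):

for (i = 0; i < progp->pg_nvers; i++) {
	if (progp->pg_vers[i] == NULL)
		continue;
	if (progp->pg_vers[i]->vs_hidden)
		continue;	/* hidden versions are never told to rpcbind */
	error = __svc_register(progp->pg_name, progp->pg_prog,
			       i, family, proto, port);
}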
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index ee1a5b3cd48d..726e59a9e50f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -987,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, }; -- cgit v1.2.3-58-ga151 From 5c8a0fbba543d9428a486f0d1282bbcf3cf1d95a Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 1 Nov 2011 18:23:10 -0500 Subject: VFS: fix statfs() automounter semantics regression No one in their right mind would expect statfs() to not work on a automounter managed mount point. Fix it. [ I'm not sure about the "no one in their right mind" part. It's not mounted, and you didn't ask for it to be mounted. But nobody will really care, and this probably makes it match previous semantics, so.. - Linus ] This mirrors the fix made to the quota code in 815d405ceff0d69646. Signed-off-by: Dan McGee Cc: Trond Myklebust Cc: Alexander Viro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/statfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/statfs.c b/fs/statfs.c index 8244924dec55..9cf04a118965 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; - int error = user_path(pathname, &path); + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); -- cgit v1.2.3-58-ga151 From c6ffe10015f4e6fba8a915318b319c43aed1836f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Nov 2011 09:23:36 -0700 Subject: ceph: use new D_COMPLETE dentry flag We used to use a flag on the directory inode to track whether the dcache contents for a directory were a complete cached copy. Switch to a dentry flag CEPH_D_COMPLETE that is safely updated by ->d_prune(). Signed-off-by: Sage Weil --- fs/ceph/caps.c | 8 +++---- fs/ceph/dir.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/ceph/inode.c | 8 +++---- fs/ceph/mds_client.c | 6 +++--- fs/ceph/super.h | 10 +++++++-- 5 files changed, 68 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 15b21e35078a..0f327c6c9679 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, clear D_COMPLETE; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - } + if (S_ISDIR(ci->vfs_inode.i_mode)) + ceph_dir_clear_complete(&ci->vfs_inode); } } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 97fb155b3242..2abd0dfad7f8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * I_COMPLETE tells indicates we have all dentries in the dir. 
It is + * D_COMPLETE tells indicates we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ @@ -199,8 +199,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_test_complete(dir)) { + dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -285,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(inode) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&inode->i_lock); err = __dcache_readdir(filp, dirent, filldir); @@ -351,7 +351,7 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude I_COMPLETE */ + fi->dir_release_count--; /* preclude D_COMPLETE */ } /* note next offset and last dentry name */ @@ -430,8 +430,7 @@ more: */ spin_lock(&inode->i_lock); if (ci->i_release_count == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = filp->f_pos; } spin_unlock(&inode->i_lock); @@ -614,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(dir) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, */ /* d_move screws up d_subdirs order */ - ceph_i_clear(new_dir, CEPH_I_COMPLETE); + ceph_dir_clear_complete(new_dir); d_move(old_dentry, new_dentry); @@ -1092,6 +1091,48 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * Set/clear/test dir complete flag on the dir's dentry. + */ +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + return alias; +} + +void ceph_dir_set_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } +} + +void ceph_dir_clear_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } +} + +bool ceph_dir_test_complete(struct inode *inode) +{ + struct dentry *dentry = __d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) + return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + return false; +} + /* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. 
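Worth noting: the flag lives in ceph_dentry_info->flags and is manipulated with the atomic bitops, so the set/clear/test helpers above need no lock of their own; the underlying pattern is simply:

unsigned long flags = 0;

set_bit(CEPH_D_COMPLETE, &flags);	/* atomic read-modify-write */
if (test_bit(CEPH_D_COMPLETE, &flags))
	;	/* the dcache holds the complete directory */
clear_bit(CEPH_D_COMPLETE, &flags);	/* safe against a racing d_prune */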
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1616a0d37cbd..e392bfce84a3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -771,9 +771,9 @@ no_change: ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + !ceph_dir_test_complete(inode)) { dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = 2; } @@ -856,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) di = ceph_dentry(dn); spin_lock(&inode->i_lock); - if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + if (!ceph_dir_test_complete(inode)) { spin_unlock(&inode->i_lock); return; } @@ -1056,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate * directory offset so we can behave when holding - * I_COMPLETE. + * D_COMPLETE. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1d72f15fe9f4..b4c3efa163c2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2002,7 +2002,7 @@ out: } /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) @@ -2010,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) struct inode *inode = req->r_locked_dir; struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ceph_dir_clear_complete(inode); ci->i_release_count++; spin_unlock(&inode->i_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 98e60693c9a1..01bf189e08a9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -264,7 +264,7 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -429,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ #define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ @@ -486,6 +485,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) return ((loff_t)frag << 32) | (loff_t)off; } +/* + * set/clear directory D_COMPLETE flag + */ +void ceph_dir_set_complete(struct inode *inode); +void ceph_dir_clear_complete(struct inode *inode); +bool ceph_dir_test_complete(struct inode *inode); + /* * caps helpers */ -- cgit v1.2.3-58-ga151 From 7fd7d101ff50af55d6d69f4705facc00c324024e Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:22:11 -0700 Subject: ceph/mds_client.c: quiet sparse noise Quiet the following sparse noise: warning: symbol 'get_nonsnap_parent' was not declared. Should it be static? 
warning: symbol 'done_closing_sessions' was not declared. Should it be static? Local functions don't need external visibility. Make them static. Signed-off-by: H Hartley Sweeten Cc: Sage Weil Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b4c3efa163c2..264ab701154f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -struct dentry *get_nonsnap_parent(struct dentry *dentry) +static struct dentry *get_nonsnap_parent(struct dentry *dentry) { /* * we don't need to worry about protecting the d_parent access @@ -3154,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; -- cgit v1.2.3-58-ga151 From 0c6d4b4e22a513f8563a2e00c5ab08e9f8796272 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 11:53:30 -0700 Subject: ceph/super.c: quiet sparse noise Quiet the sparse noise: warning: symbol 'create_fs_client' was not declared. Should it be static? warning: symbol 'destroy_fs_client' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Cc: Sage Weil ceph-devel@vger.kernel.org Signed-off-by: Sage Weil --- fs/ceph/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 788f5ad8e66d..a90846fac759 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -426,7 +426,7 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) /* * create a new fs client */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; @@ -502,7 +502,7 @@ fail: return ERR_PTR(err); } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); -- cgit v1.2.3-58-ga151 From cd354ad613a393424f85333ceed6b15e07fb94ae Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 20 Oct 2011 15:45:37 -0400 Subject: Btrfs: don't wait as long for more batches during SSD log commit When we're doing log commits, we try to wait for more writers to come in and make the commit bigger. This helps improve performance on rotating disks, but on SSDs it adds latencies.
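The loop this change sits in makes the trade-off visible; paraphrased from btrfs_sync_log(), where only the btrfs_test_opt(root, SSD) check is new in this patch:

while (1) {
	unsigned long batch = root->log_batch;	/* sample before napping */
	/* rotating disks: nap so more writers can join this commit;
	 * SSDs: skip the nap, the extra latency isn't worth it */
	if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
		mutex_unlock(&root->log_mutex);
		schedule_timeout_uninterruptible(1);
		mutex_lock(&root->log_mutex);
	}
	wait_for_writer(trans, root);
	if (batch == root->log_batch)
		break;		/* nobody new arrived; commit now */
}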
Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 786639fca067..310ab22cfe58 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2013,10 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { unsigned long batch = root->log_batch; - if (root->log_multiple_pids) { + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); -- cgit v1.2.3-58-ga151 From 1eae31e918972bbeefc119d23c1d67674f49a301 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 14 Oct 2011 06:31:20 -0400 Subject: Btrfs: make sure btrfs_remove_free_space doesn't leak EAGAIN btrfs_remove_free_space needs to make sure to set ret back to a valid return value after setting it to EAGAIN, otherwise we return it to the callers. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index de205d59b74b..f49475dfa954 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1868,6 +1868,7 @@ again: ctl->total_bitmaps--; } kmem_cache_free(btrfs_free_space_cachep, info); + ret = 0; goto out_lock; } @@ -1875,7 +1876,8 @@ again: unlink_free_space(ctl, info); info->offset += bytes; info->bytes -= bytes; - link_free_space(ctl, info); + ret = link_free_space(ctl, info); + WARN_ON(ret); goto out_lock; } -- cgit v1.2.3-58-ga151 From e688b7252f784c2479d559f9f70ca8354752c5e7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 31 Oct 2011 20:52:39 -0400 Subject: Btrfs: fix extent pinning bugs in the tree log The tree log had two important bugs that could cause corruptions after a crash. Sometimes we were allowing tree log blocks to be reused after the tree log was committed but before the transaction commit was done. This allowed a future metadata write to overwrite the tree log data. It is fixed by adding a new variant of freeing reserved extents that always pins them. Credit goes to Stefan Behrens and Arne Jansen for many many hours spent tracking this bug down. During tree log replay, we do a pass through the tree log and pin all the extents we find. This makes sure the replay code won't go in and use any of those blocks for new allocations during replay. The problem is the free space cache isn't honoring these pinned extents. So the allocator can end up handing them out, leading to all kinds of problems during replay. The fix here is to force any free space cache to load while we pin the extents, and then to make sure we remove the pinned extents from the free space rbtree. 
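Condensed, the replay-time pin helper added below boils down to this sequence (taken from btrfs_pin_extent_for_log_replay() in the diff):

cache = btrfs_lookup_block_group(root->fs_info, bytenr);
cache_block_group(cache, trans, root, 1);	/* force the free space cache to load */
pin_down_extent(root, cache, bytenr, num_bytes, 0);
btrfs_remove_free_space(cache, bytenr, num_bytes); /* make the cache honor the pin */
btrfs_put_block_group(cache);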
Signed-off-by: Chris Mason Reported-by: Stefan Behrens --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/tree-log.c | 11 ++++++----- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 227620993bce..f63c9b3f6e08 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2156,6 +2156,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -2206,6 +2209,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 28c4809851a5..cb7626646bba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4344,6 +4344,34 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. 
+ */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -5487,7 +5515,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5502,8 +5531,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5511,6 +5544,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 310ab22cfe58..8ca1b6b83bd1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + btrfs_pin_extent_for_log_replay(wc->trans, + log->fs_info->extent_root, + eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); } @@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); BUG_ON(ret); @@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); BUG_ON(ret); } -- cgit v1.2.3-58-ga151 From 01d658f2ca3c85c1ffb20b306e30d16197000ce7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 1 Nov 2011 10:08:06 -0400 Subject: Btrfs: make sure to flush queued bios if write_cache_pages waits write_cache_pages tries to build up a large bio to stuff down the pipe. 
But if it needs to wait for a page lock, it needs to make sure and send down any pending writes so we don't deadlock with anyone who has the page lock and is waiting for writeback of things inside the bio. Dave Sterba triggered this as a deadlock between the autodefrag code and the extent write_cache_pages Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 13 ++++++++++--- fs/btrfs/disk-io.h | 2 -- fs/btrfs/extent_io.c | 14 ++++++++++---- fs/btrfs/extent_io.h | 3 ++- 4 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51372a521167..6f58911ece0d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2735,7 +2735,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return ret; } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2752,7 +2753,10 @@ int btree_lock_page_hook(struct page *page) if (!eb) goto out; - btrfs_tree_lock(eb); + if (!btrfs_try_tree_write_lock(eb)) { + flush_fn(data); + btrfs_tree_lock(eb); + } btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2767,7 +2771,10 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_unlock(eb); free_extent_buffer(eb); out: - lock_page(page); + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } return 0; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..e678539c8519 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -83,8 +83,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f284d4e5f447..b40ba75f4483 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2613,10 +2613,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 325a346369da..cbd4824a7c94 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -86,7 +86,8 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + int (*write_cache_pages_lock_hook)(struct page *page, void *data, + void (*flush_fn)(void *)); }; struct extent_io_tree { -- cgit v1.2.3-58-ga151 From 5a77d76c243be18d854aa1b14d697525f60e169a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Nov 2011 14:32:23 -0400 Subject: Btrfs: release metadata from global reserve if we have to fallback for unlink I fixed a problem where we weren't reserving space for an orphan item when we had to fallback to using the global reserve for an unlink, but I introduced another problem. 
I was migrating the bytes from the transaction reserve to the global reserve and then releasing from the global reserve in btrfs_end_transaction(). The problem with this is that a migrate will jack up the size for the destination, but leave the size alone for the source, with the idea that you can do a release normally on the source and it all washes out, and then you can do a release again on the destination and it works out right. My way was skipping the release on the trans_block_rsv which still had the jacked up size from our original reservation. So instead release manually from the global reserve if this transaction was using it, and then set the trans->block_rsv back to the trans_block_rsv so that btrfs_end_transaction cleans everything up properly. With this patch xfstest 83 doesn't emit warnings about leaking space. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 65474d95f26f..a8586e10953c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2904,7 +2904,7 @@ out: if (!err) err = btrfs_block_rsv_migrate(trans->block_rsv, &root->fs_info->global_block_rsv, - btrfs_calc_trans_metadata_size(root, 1)); + trans->bytes_reserved); if (err) { btrfs_end_transaction(trans, root); @@ -2920,6 +2920,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (trans->block_rsv == &root->fs_info->global_block_rsv) { + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->block_rsv = &root->fs_info->trans_block_rsv; BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } -- cgit v1.2.3-58-ga151 From c8174313a8102e874aaa321e2fc4c7c460a87151 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Nov 2011 09:29:35 -0400 Subject: Btrfs: use the global reserve when truncating the free space cache inode We no longer use the orphan block rsv for holding the reservation for truncating the inode, so instead use the global block rsv and check to make sure it has enough space for us to truncate the space. 
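The check itself is small; condensed from the diff below, after trans->block_rsv has been pointed at the global reserve and the old value saved in rsv:

/* 1 for slack space, 1 for updating the inode */
needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
	       btrfs_calc_trans_metadata_size(root, 1);

spin_lock(&trans->block_rsv->lock);
if (trans->block_rsv->reserved < needed_bytes) {
	spin_unlock(&trans->block_rsv->lock);
	trans->block_rsv = rsv;		/* restore the saved rsv */
	return -ENOSPC;			/* refuse rather than overdraw the reserve */
}
spin_unlock(&trans->block_rsv->lock);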
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f49475dfa954..7a15fcfb3e1f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,14 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct inode *inode) { struct btrfs_block_rsv *rsv; + u64 needed_bytes; loff_t oldsize; int ret = 0; rsv = trans->block_rsv; - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5); - if (ret) - return ret; + trans->block_rsv = &root->fs_info->global_block_rsv; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&trans->block_rsv->lock); + if (trans->block_rsv->reserved < needed_bytes) { + spin_unlock(&trans->block_rsv->lock); + trans->block_rsv = rsv; + return -ENOSPC; + } + spin_unlock(&trans->block_rsv->lock); oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -218,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - trans->block_rsv = rsv; if (ret) { + trans->block_rsv = rsv; WARN_ON(1); return ret; } ret = btrfs_update_inode(trans, root, inode); + trans->block_rsv = rsv; + return ret; } -- cgit v1.2.3-58-ga151 From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Apr 2011 15:41:04 +0200 Subject: btrfs: separate superblock items out of fs_info fs_info is now ~9kb, more than fits into one page. This will cause mount failure when memory is too fragmented. Top space consumers are super block structures super_copy and super_for_commit, ~2.8kb each. Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64) Add a wrapper for freeing fs_info and all of its dynamically allocated members.
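The arithmetic behind the mount failure: with 4kb pages, a ~9kb kzalloc() is rounded up to a 16kb slab, i.e. an order-2 physically contiguous block, while after the split every piece fits in a single order-0 page. The allocation then pairs with the new wrapper like this (names from the diff below; the error path is simplified as a sketch - kfree(NULL) is a no-op, so freeing a partially initialized fs_info is safe):

fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
	free_fs_info(fs_info);
	return ERR_PTR(-ENOMEM);
}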
Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 ++- fs/btrfs/ctree.h | 16 ++++++++++++++-- fs/btrfs/disk-io.c | 33 ++++++++++----------------------- fs/btrfs/extent-tree.c | 16 ++++++++-------- fs/btrfs/file-item.c | 17 ++++++----------- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 8 ++++---- fs/btrfs/scrub.c | 2 +- fs/btrfs/super.c | 19 +++++++++++++------ fs/btrfs/transaction.c | 10 +++++----- fs/btrfs/tree-log.c | 4 ++-- fs/btrfs/volumes.c | 24 ++++++++++++------------ 12 files changed, 78 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86f1734..14f1c5a0b2d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f63c9b3f6e08..5181c53c1124 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -936,8 +936,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -2387,6 +2387,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6f58911ece0d..761717c98278 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -1766,14 +1765,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2152,7 +2151,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: 
invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2164,12 +2162,7 @@ fail_bdi: fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + free_fs_info(fs_info); return ERR_PTR(err); } @@ -2338,10 +2331,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); @@ -2603,7 +2596,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2624,12 +2616,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb7626646bba..782eb3ea8edf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if 
(btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; @@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821becd..c7fb3a4247d3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8586e10953c..b6b612e14ed7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -824,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33aae13cc74b..8f6e14279409 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,7 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -1164,7 +1164,7 @@ int btrfs_defrag_file(struct inode *inode, struct file 
*file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; @@ -2613,7 +2613,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2629,7 +2629,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..69a600f07763 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->curr = -1; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5429b1fa0bfc..f7e9de724ef2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -216,7 +216,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *compress_type; bool compress_force = false; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); @@ -524,7 +524,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. 
*/ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -937,6 +937,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; tree_root->fs_info = fs_info; + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_close_devices; + } + bdev = fs_devices->latest_bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) { @@ -951,7 +958,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -979,7 +986,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); return ERR_PTR(error); } @@ -1005,7 +1012,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -1171,7 +1178,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 29bef63e23ba..373c7ec1a026 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -991,7 +991,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1301,12 +1301,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ca1b6b83bd1..f4d81c06d48f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2118,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, 
+ btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6938b45e0fd..c3b45564048e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1395,8 +1395,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1458,7 +1458,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1706,12 +1706,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1802,7 +1802,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1861,7 +1861,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2187,7 +2187,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2311,7 +2311,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy 
= &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -3653,7 +3653,7 @@ static int read_one_dev(struct btrfs_root *root, int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; -- cgit v1.2.3-58-ga151 From af31f5e5b84b5bf2bcec464153a5130b170b2770 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 15:17:42 -0400 Subject: Btrfs: add a log of past tree roots This takes some of the free space in the btrfs super block to record information about most of the roots in the last four commits. It also adds a -o recovery to use the root history log when we're not able to read the tree of tree roots, the extent tree root, the device tree root or the csum root. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 95 +++++++++++++++++ fs/btrfs/disk-io.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/super.c | 7 +- 3 files changed, 369 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5181c53c1124..78f43d1102a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -360,6 +360,47 @@ struct btrfs_header { #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 #define BTRFS_LABEL_SIZE 256 +/* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unsed_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -406,6 +447,7 @@ struct btrfs_super_block { /* future expansion */ __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -1113,6 +1155,9 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* next backup root to be overwritten */ + int backup_root_index; }; /* @@ -1357,6 +1402,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1972,6 +2018,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); 
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 761717c98278..a61f8a6cf219 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1134,10 +1134,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { free_extent_buffer(root->node); + root->node = NULL; return -EIO; } root->commit_root = btrfs_root_node(root); @@ -1576,6 +1578,228 @@ sleep: return 0; } +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. 
+ */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + 
btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_extent_buffer(info->tree_root->node); + free_extent_buffer(info->tree_root->commit_root); + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + + info->tree_root->node = NULL; + info->tree_root->commit_root = NULL; + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + + if (chunk_root) { + free_extent_buffer(info->chunk_root->node); + free_extent_buffer(info->chunk_root->commit_root); + info->chunk_root->node = NULL; + info->chunk_root->commit_root = NULL; + } +} + + struct btrfs_root *open_ctree(struct super_block *sb, 
struct btrfs_fs_devices *fs_devices, char *options) @@ -1603,6 +1827,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, int ret; int err = -EINVAL; + int num_backups_tried = 0; + int backup_index = 0; struct btrfs_super_block *disk_super; @@ -1781,6 +2007,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. @@ -1938,7 +2171,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1953,11 +2186,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_close_extra_devices(fs_devices); +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1965,32 +2199,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) - goto fail_tree_root; + goto recovery_tree_root; extent_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) - goto fail_extent_root; + goto recovery_tree_root; dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_dev_root; + goto recovery_tree_root; csum_root->track_dirty = 1; @@ -2123,20 +2358,10 @@ fail_cleaner: fail_block_groups: btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2164,6 +2389,25 @@ fail_srcu: fail: free_fs_info(fs_info); return ERR_PTR(err); + +recovery_tree_root: + + if (!btrfs_test_opt(tree_root, RECOVERY)) 
+ goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2333,6 +2577,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f7e9de724ef2..57080dffdfc6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -198,6 +198,7 @@ static match_table_t tokens = { {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "no_space_cache"}, + {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -392,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_recovery: + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); -- cgit v1.2.3-58-ga151 From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion I've been hitting warnings in use_block_rsv when running the delayed insertion stuff. It's because we will readjust global block rsv based on what is in use, which means we could end up discarding reservations that are for the delayed insertion stuff. So instead create a separate block rsv for the delayed insertion stuff. This will also make it easier to debug problems with the delayed insertion reservations since we will know that only the delayed insertion code touches this block_rsv.
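For orientation, the mechanics of adding a dedicated reservation are spread across four files in the diff below; this condensed sketch gathers the touch points in one place (kernel C assembled from this patch's own hunks, not a new API, and not independently buildable):

/* 1) declare it beside the other reservations (fs/btrfs/ctree.h) */
struct btrfs_block_rsv delayed_block_rsv;

/* 2) zero it at mount time (open_ctree in fs/btrfs/disk-io.c) */
btrfs_init_block_rsv(&fs_info->delayed_block_rsv);

/* 3) wire it to the metadata space_info and warn on leaked bytes (fs/btrfs/extent-tree.c) */
fs_info->delayed_block_rsv.space_info = space_info;
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);

/* 4) make the delayed-item reserve/release paths use it instead of the global rsv (fs/btrfs/delayed-inode.c) */
dst_rsv = &root->fs_info->delayed_block_rsv;
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);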
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/delayed-inode.c | 14 +++++++------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 78f43d1102a0..3002e5d4da0b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -957,6 +957,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672f4c18..fc4026af7290 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -628,7 +628,7 @@ static int btrfs_delayed_inode_reserve_metadata( return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -646,7 +646,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1026,7 +1026,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1069,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1149,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a61f8a6cf219..23b6776477b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1891,6 +1891,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); 
atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 782eb3ea8edf..01c1f08b976a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, -- cgit v1.2.3-58-ga151 From 663350ac38c67ca388acea6e876dc6d668c232b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: be smarter about committing the transaction in reserve_metadata_bytes Because of the overcommit stuff I had to make it so that we committed the transaction all the time in reserve_metadata_bytes in case we had overcommitted because of delayed items. This was because previously we had no way of knowing how much space was reserved for delayed items. Now that we have the delayed_block_rsv we can check it to see if committing the transaction would get us anywhere. This patch breaks out the committing logic into a helper function that will check to see if committing the transaction would free enough space for us to get anything done. With this patch xfstests 83 goes from taking 445 seconds to taking 28 seconds on my box. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 86 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 01c1f08b976a..5b84205e7685 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,12 +3334,12 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3348,6 +3348,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; @@ -3417,6 +3418,60 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. 
+ */ +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -3436,7 +3491,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - struct btrfs_trans_handle *trans; u64 used; u64 num_bytes = orig_bytes; int retries = 0; @@ -3445,7 +3499,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, bool flushing = false; bool wait_ordered = false; - trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3461,7 +3514,7 @@ again: * deadlock since we are waiting for the flusher to finish, but * hold the current transaction open. */ - if (trans) + if (current->journal_info) return -EAGAIN; ret = wait_event_interruptible(space_info->wait, !space_info->flush); @@ -3517,12 +3570,16 @@ again: */ avail = (space_info->total_bytes - space_info->bytes_used) * 8; do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !trans && - !committed) { + if (space_info->bytes_pinned >= avail && flush && !committed) { space_info->flush = 1; flushing = true; spin_unlock(&space_info->lock); - goto commit; + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; } spin_lock(&root->fs_info->free_chunk_lock); @@ -3575,7 +3632,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); + ret = shrink_delalloc(root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3592,21 +3649,12 @@ again: goto again; } - ret = -EAGAIN; - if (trans) - goto out; - -commit: ret = -ENOSPC; if (committed) goto out; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto out; - ret = btrfs_commit_transaction(trans, root); + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } -- cgit v1.2.3-58-ga151 From bf0da8c183a15656eee63c54f334c3794320872a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 4 Nov 2011 12:29:37 -0400 Subject: Btrfs: ClearPageError during writepage and clean_tree_block Failure testing was tripping up over stale PageError bits in metadata pages. If we have an io error on a block, and later on end up reusing it, nobody ever clears PageError on those pages. 
During commit, we'll find PageError and think we had trouble writing the block, which will lead to aborts and other problems. This changes clean_tree_block and the btrfs writepage code to clear the PageError bit. In both cases we're either completely done with the page or the page has good stuff and the error bit is no longer valid. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/transaction.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b40ba75f4483..cc3c58970d4e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2334,6 +2334,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); + + ClearPageError(page); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -3402,6 +3405,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 373c7ec1a026..29f782cc2cc9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -634,7 +634,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ret = btrfs_write_marked_extents(root, dirty_pages, mark); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - return ret || ret2; + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; } int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, -- cgit v1.2.3-58-ga151 From c06a0e120a4e381a1c291c1fce3c6155c5791cae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Nov 2011 19:56:02 -0400 Subject: Btrfs: fix delayed insertion reservation We all keep getting those stupid warnings from use_block_rsv when running stress.sh, and it's because the delayed insertion stuff is being stupid. It's not the delayed insertion stuffs fault, it's all just stupid. When marking an inode dirty for oh say updating the time on it, we just do a btrfs_join_transaction, which doesn't reserve any space. This is stupid because we're going to have to have space reserve to make this change, but we do it because it's fast because chances are we're going to call it over and over again and it doesn't matter. Well thanks to the delayed insertion stuff this is mostly the case, so we do actually need to make this reservation. So if trans->bytes_reserved is 0 then try to do a normal reservation. If not return ENOSPC which will make the btrfs_dirty_inode start a proper transaction which will let it do the whole ENOSPC dance and reserve enough space for the delayed insertion to steal the reservation from the transaction. The other stupid thing we do is not reserve space for the inode when writing to the thing. Usually this is ok since we have to update the time so we'd have already done all this work before we get to the endio stuff, so it doesn't matter. But this is stupid because we could write the data after the transaction commits where we changed the mtime of the inode so we have to cow all the way down to the inode anyway. This used to be masked by the delalloc reservation stuff, but because we delay the update it doesn't get masked in this case. So again the delayed insertion stuff bites us in the ass. 
So if our trans->block_rsv is delalloc, just steal the reservation from the delalloc reserve. Hopefully this won't bite us in the ass, but I've said that before. With this patch stress.sh no longer spits out those stupid warnings (famous last words). Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/delayed-inode.c | 36 ++++++++++++++++++++++++++++-------- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 3 files changed, 49 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3002e5d4da0b..6bb34fc1ff22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2353,6 +2353,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc4026af7290..bbe8496d5339 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata( u64 num_bytes; int ret; - if (!trans->bytes_reserved) - return 0; - src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv) { + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. 
+ */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) + node->bytes_reserved = num_bytes; + return ret; + } + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); if (!ret) node->bytes_reserved = num_bytes; @@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, } ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); - /* - * we must reserve enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + if (ret) + goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b84205e7685..23e936c3de76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3814,6 +3814,24 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { -- cgit v1.2.3-58-ga151 From 740c3d226cbba6cd6a32adfb66809c94938f3e57 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Nov 2011 15:48:34 -0400 Subject: Btrfs: fix the new inspection ioctls for 32 bit compat The new ioctls to follow backrefs are not clean for 32/64 bit compat. This reworks them for u64s everywhere. They are brand new, so there are no problems with changing the interface now. Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 8 ++++---- fs/btrfs/ioctl.c | 10 +++++----- fs/btrfs/ioctl.h | 11 +++++------ fs/btrfs/scrub.c | 2 +- 4 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2351df0de450..8855aad3929c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -676,14 +676,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, bytes_left = ipath->fspath->bytes_left > s_ptr ? ipath->fspath->bytes_left - s_ptr : 0; - fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr; + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->str[i] = fspath; + ipath->fspath->val[i] = (u64)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { @@ -698,9 +698,9 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, /* * this dumps all file system paths to the inode into the ipath struct, provided * is has been created large enough. each path is zero-terminated and accessed - * from ipath->fspath->str[i]. + * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available - * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc9893990341..4a34c472f126 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2895,7 +2895,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; int i; - unsigned long rel_ptr; + u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; struct inode_fs_paths *ipath = NULL; @@ -2930,11 +2930,11 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str; - ipath->fspath->str[i] = (void *)rel_ptr; + rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user(ipa->fspath, ipath->fspath, size); + ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3017,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user(loi->inodes, inodes, size); + ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 2da30d4950e6..252ae9915de8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -198,24 +198,23 @@ struct btrfs_data_container { __u32 bytes_missing; /* out -- additional bytes needed for result */ __u32 elem_cnt; /* out */ __u32 elem_missed; /* out */ - union { - char *str[0]; /* out */ - __u64 val[0]; /* out */ - }; + __u64 val[0]; /* out */ }; struct btrfs_ioctl_ino_path_args { __u64 inum; /* in */ __u32 size; /* in */ __u64 reserved[4]; - struct btrfs_data_container *fspath; /* out */ + /* struct btrfs_data_container *fspath; out */ + __u64 fspath; /* out */ }; struct btrfs_ioctl_logical_ino_args { __u64 logical; /* in */ __u32 size; /* in */ __u64 reserved[4]; - struct btrfs_data_container *inodes; /* out */ + /* struct btrfs_data_container *inodes; out */ + __u64 inodes; }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 94cd3a19e9c8..562dad10dee9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - ipath->fspath->str[i]); + (char *)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v1.2.3-58-ga151 From c674e04e1cd6049715e7b9446790f4b441e547c0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 22:23:13 -0400 Subject: Btrfs: fix extent_buffer leak in the metadata IO error handling The scrub readahead branch brought in a new error handling hook, but it was leaking extent_buffer references. 
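The leak follows the usual reference-counting rule: the lookup that produced the extent_buffer took a reference, so every exit from the hook must drop it, the error path included. A minimal sketch of the fixed shape (eb_lookup() is a hypothetical stand-in for the btree lookup; free_extent_buffer() is the real drop added by the one-line fix below):

	eb = eb_lookup(tree, start, len);	/* hypothetical: returns eb with a reference held */
	if (!eb)
		goto out;
	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
		btree_readahead_hook(root, eb, eb->start, -EIO);
	}
	free_extent_buffer(eb);			/* drop the lookup reference on this path too */
out:
	return -EIO;				/* we fixed nothing */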
Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0eb1f0951251..40a62b980087 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -644,6 +644,7 @@ static int btree_io_failed_hook(struct bio *failed_bio, clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, -EIO); } + free_extent_buffer(eb); out: return -EIO; /* we fixed nothing */ -- cgit v1.2.3-58-ga151 From 306c8b68c82dfe6b7c9e5b61985760ad5d089205 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 15:21:39 -0400 Subject: Btrfs: stop the readahead threads on failed mount If we don't stop them, they linger around corrupting memory by using pointers to freed things. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 40a62b980087..e532892431f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2444,6 +2444,7 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); -- cgit v1.2.3-58-ga151 From 9510dc4c62252e96dca44a94948327c76774f25b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: stop leaking btrfs_bios on readahead Signed-off-by: Chris Mason --- fs/btrfs/reada.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index cd857119ba8a..35eaf6893679 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -423,6 +423,7 @@ again: } spin_unlock(&fs_info->reada_lock); + kfree(multi); return re; error: @@ -447,6 +448,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } + kfree(multi); kfree(re); if (looped) goto again; -- cgit v1.2.3-58-ga151 From 21ca543efc12674fddb22ddf4ea4906427f4e982 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: rename btrfs_bio multi -> bbio for consistency Signed-off-by: Chris Mason --- fs/btrfs/reada.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 35eaf6893679..2373b39a132b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -247,7 +247,7 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, struct btrfs_device *dev, u64 logical, - struct btrfs_bio *multi) + struct btrfs_bio *bbio) { int ret; int looped = 0; @@ -297,11 +297,11 @@ again: kref_init(&zone->refcnt); zone->elems = 0; zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < multi->num_stripes; ++i) { + for (i = 0; i < bbio->num_stripes; ++i) { /* bounds have already been checked */ - zone->devs[i] = multi->stripes[i].dev; + zone->devs[i] = bbio->stripes[i].dev; } - zone->ndevs = multi->num_stripes; + zone->ndevs = bbio->num_stripes; spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, @@ -327,7 +327,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_extent *re = NULL; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct 
btrfs_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; u32 blocksize; u64 length; @@ -361,21 +361,21 @@ again: * map block */ length = blocksize; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0); - if (ret || !multi || length < blocksize) + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + if (ret || !bbio || length < blocksize) goto error; - if (multi->num_stripes > MAX_MIRRORS) { + if (bbio->num_stripes > MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " "supported", MAX_MIRRORS); goto error; } - for (nzones = 0; nzones < multi->num_stripes; ++nzones) { + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { struct reada_zone *zone; - dev = multi->stripes[nzones].dev; - zone = reada_find_zone(fs_info, dev, logical, multi); + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); if (!zone) break; @@ -407,11 +407,11 @@ again: goto error; } for (i = 0; i < nzones; ++i) { - dev = multi->stripes[i].dev; + dev = bbio->stripes[i].dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { - dev = multi->stripes[i].dev; + dev = bbio->stripes[i].dev; BUG_ON(dev == NULL); radix_tree_delete(&dev->reada_extents, index); } @@ -423,7 +423,7 @@ again: } spin_unlock(&fs_info->reada_lock); - kfree(multi); + kfree(bbio); return re; error: @@ -448,7 +448,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - kfree(multi); + kfree(bbio); kfree(re); if (looped) goto again; -- cgit v1.2.3-58-ga151 From 56d2a48f81a1bde827c625b90221fade72fe9c61 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: fix a potential btrfs_bio leak on scrub fixups In case we were able to map less than we wanted (length < PAGE_SIZE clause is true) btrfs_bio is still allocated and we have to free it. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 562dad10dee9..ed11d3866afd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -655,6 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); WARN_ON(1); + kfree(bbio); return; } -- cgit v1.2.3-58-ga151 From d43317dcd074818d4bd12ddd4184a29aff98907b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 6 Nov 2011 03:26:19 -0500 Subject: Btrfs: fix race during transaction joins While we're allocating ram for a new transaction, we drop our spinlock. When we get the lock back, we do check to see if a transaction started while we slept, but we don't check to make sure it isn't blocked because a commit has already started. 
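The race has a classic shape: every check made before dropping a lock has to be redone after reacquiring it, because the allocation in between can sleep. A self-contained userspace analogue (plain C with pthreads; join_txn(), running and no_join are invented stand-ins for the btrfs names, purely illustrative):

#include <pthread.h>
#include <stdlib.h>

struct txn { int writers; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn *running;	/* stands in for fs_info->running_transaction */
static int no_join;		/* stands in for fs_info->trans_no_join */

static struct txn *join_txn(void)
{
	struct txn *fresh;

	pthread_mutex_lock(&lock);
loop:
	if (no_join) {				/* a commit has blocked new joins */
		pthread_mutex_unlock(&lock);
		return NULL;
	}
	if (running) {				/* join the transaction that already exists */
		running->writers++;
		pthread_mutex_unlock(&lock);
		return running;
	}

	pthread_mutex_unlock(&lock);		/* the allocation may sleep */
	fresh = calloc(1, sizeof(*fresh));
	if (!fresh)
		return NULL;
	pthread_mutex_lock(&lock);

	if (running || no_join) {
		/* someone raced in while we slept: discard ours, redo ALL the checks */
		free(fresh);
		goto loop;
	}

	fresh->writers = 1;
	running = fresh;
	pthread_mutex_unlock(&lock);
	return fresh;
}

The goto back to the top, taken with the lock held, is the point of the fix: rejoining mid-way (as the old code did) skips the no_join recheck and can join a transaction that is already committing.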
Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 29f782cc2cc9..960835eaf4da 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) struct btrfs_transaction *cur_trans; spin_lock(&root->fs_info->trans_lock); +loop: if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); if (root->fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the trans_no_join checks above + */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); - return 0; + goto loop; } + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; init_waitqueue_head(&cur_trans->writer_wait); -- cgit v1.2.3-58-ga151 From 7c7e82a77fe3d89ae50824aa7c897454675eb4c4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 6 Nov 2011 18:50:56 -0500 Subject: Btrfs: check for a null fs root when writing to the backup root log During log replay, we can commit the transaction before the fs_root pointers are set up, so we have to make sure they are not null before trying to use them. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e532892431f4..e53a5bb85670 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1754,11 +1754,18 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_extent_root_level(root_backup, btrfs_header_level(info->extent_root->node)); - btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); - btrfs_set_backup_fs_root_gen(root_backup, + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, btrfs_header_generation(info->fs_root->node)); - btrfs_set_backup_fs_root_level(root_backup, + btrfs_set_backup_fs_root_level(root_backup, btrfs_header_level(info->fs_root->node)); + } btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); btrfs_set_backup_dev_root_gen(root_backup, -- cgit v1.2.3-58-ga151 From 50e696308c3fb18a4a0dae7b3a4d47469149c919 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Nov 2011 16:39:57 +0000 Subject: vfs: d_invalidate() should leave mountpoints alone Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/dcache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 274f13e2f094..a901c6901bce 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -546,9 +546,11 @@ int d_invalidate(struct dentry * dentry) * would make it unreachable from the root, * we might still populate it if it was a * working directory or similar).
+ * We also need to leave mountpoints alone, + * directory or not. */ - if (dentry->d_count > 1) { - if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + if (dentry->d_count > 1 && dentry->d_inode) { + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; } } -- cgit v1.2.3-58-ga151 From 45ea6095c8f0d6caad5658306416a5d254f1205e Mon Sep 17 00:00:00 2001 From: "slyich@gmail.com" Date: Mon, 7 Nov 2011 16:08:01 -0500 Subject: btrfs: fix double-free 'tree_root' in 'btrfs_mount()' On the error path 'tree_root' is freed in 'free_fs_info()'. No need to free it explicitly. Noticed by SLUB in debug mode: Complete reproducer under usermode linux (discovered on real machine): bdev=/dev/ubda btr_root=/btr /mkfs.btrfs $bdev mount $bdev $btr_root mkdir $btr_root/subvols/ cd $btr_root/subvols/ /btrfs su cr foo /btrfs su cr bar mount $bdev -osubvol=subvols/foo $btr_root/subvols/bar umount $btr_root/subvols/bar which gives device fsid 4d55aa28-45b1-474b-b4ec-da912322195e devid 1 transid 7 /dev/ubda ============================================================================= BUG kmalloc-2048: Object already free ----------------------------------------------------------------------------- INFO: Allocated in btrfs_mount+0x389/0x7f0 age=0 cpu=0 pid=277 INFO: Freed in btrfs_mount+0x51c/0x7f0 age=0 cpu=0 pid=277 INFO: Slab 0x0000000062886200 objects=15 used=9 fp=0x0000000070b4d2d0 flags=0x4081 INFO: Object 0x0000000070b4d2d0 @offset=21200 fp=0x0000000070b4a968 ... Call Trace: 70b31948: [<6008c522>] print_trailer+0xe2/0x130 70b31978: [<6008c5aa>] object_err+0x3a/0x50 70b319a8: [<6008e242>] free_debug_processing+0x142/0x2a0 70b319e0: [<600ebf6f>] btrfs_mount+0x55f/0x7f0 70b319f8: [<6008e5c1>] __slab_free+0x221/0x2d0 Signed-off-by: Sergei Trofimovich Cc: Arne Jansen Cc: Chris Mason Cc: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 57080dffdfc6..dcd5aef6b614 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -933,8 +933,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + if (!fs_info) { + error = -ENOMEM; + goto error_close_devices; + } tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!tree_root) { error = -ENOMEM; goto error_close_devices; } @@ -964,7 +968,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -992,7 +995,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); return ERR_PTR(error); } -- cgit v1.2.3-58-ga151 From a3fbbde70a0cec017f2431e8f8de208708c76acc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Nov 2011 21:21:26 +0000 Subject: VFS: we need to set LOOKUP_JUMPED on mountpoint crossing Mountpoint crossing is similar to following procfs symlinks - we do not get ->d_revalidate() called for the dentry we have arrived at, with unpleasant consequences for NFS4.
Simple way to reproduce the problem in mainline: cat >/tmp/a.c <<'EOF' #include #include #include main() { struct flock fl = {.l_type = F_RDLCK, .l_whence = SEEK_SET, .l_len = 1}; if (fcntl(0, F_SETLK, &fl)) perror("setlk"); } EOF cc /tmp/a.c -o /tmp/test then on nfs4: mount --bind file1 file2 /tmp/test < file1 # ok /tmp/test < file2 # spews "setlk: No locks available"... What happens is the missing call of ->d_revalidate() after mountpoint crossing, and that's where NFS4 would issue an OPEN request to the server. The fix is simple - treat mountpoint crossing the same way we deal with following procfs-style symlinks. I.e. set LOOKUP_JUMPED... Cc: stable@kernel.org Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/namei.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index ac6d214da827..5008f01787f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret; + return ret < 0 ? ret : need_mntput; } int follow_down_one(struct path *path) @@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, break; path->mnt = mounted; path->dentry = mounted->mnt_root; + nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the @@ -1213,6 +1214,8 @@ retry: path_put_conditional(path, nd); return err; } + if (err) + nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; } @@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ error = complete_walk(nd); if (error) return ERR_PTR(error); @@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error < 0) goto exit_dput; + if (error) + nd->flags |= LOOKUP_JUMPED; + error = -ENOENT; if (!path->dentry->d_inode) goto exit_dput; @@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + goto exit; error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; -- cgit v1.2.3-58-ga151 From b52a360b2aa1c59ba9970fb0f52bbb093fcc7a24 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 7 Nov 2011 16:10:24 +0000 Subject: xfs: Fix possible memory corruption in xfs_readlink Fixes a possible memory corruption when the link is larger than MAXPATHLEN and XFS_DEBUG is not enabled. This also removes the S_ISLNK assert, since the inode mode is checked previously in xfs_readlink_by_handle() and via VFS. Updated to address concerns raised by Ben Hutchings about the loose attention paid to 32- vs 64-bit values, and the lack of handling a potentially negative pathlen value: - Changed type of "pathlen" to be xfs_fsize_t, to match that of ip->i_d.di_size - Added checking for a negative pathlen to the too-long pathlen test, and generalized the message that gets reported in that case to reflect the change. As a result, if a negative pathlen were encountered, this function would return EFSCORRUPTED (and would fail an assertion for a debug build)--just as would a too-long pathlen.
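A general rule sits behind the XFS patch that follows: a size field read from disk is untrusted input, so it must be range-checked before it sizes a copy, and an out-of-range value is filesystem corruption to be reported at runtime, not merely asserted on debug builds. A minimal sketch under those assumptions (disk_inode and its fields are hypothetical, not the XFS types):

	static int read_symlink(const struct disk_inode *di, char *buf)
	{
		long long len = di->size;	/* signed 64-bit, straight off disk */

		if (len < 0 || len > MAXPATHLEN)
			return -EFSCORRUPTED;	/* never trust len for the copy */
		memcpy(buf, di->data, len);	/* buf must hold MAXPATHLEN + 1 */
		buf[len] = '\0';
		return 0;
	}

An ASSERT alone would vanish on production builds, which is precisely how the original corruption went unnoticed.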
Signed-off-by: Alex Elder Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4ecf2a549060..ce9268a2f56b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -112,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -122,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(S_ISLNK(ip->i_d.di_mode)); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0'; -- cgit v1.2.3-58-ga151 From 272e42b215c52d32e06bf035c1f6b70baa6716bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2011 09:54:24 +0000 Subject: xfs: constify xfs_item_ops The log item ops aren't necessarily the biggest exploit vector, but marking them const is easy enough. Also remove the unused xfs_item_ops_t typedef while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot_item.c | 6 +++--- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_trans.h | 6 +++--- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1a3513881bce..eac97ef81e2a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d236d2..0dee0b71029d 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items.
*/ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e62623437..35c2aff38b20 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b7cf21ba240f..abaafdbb3e65 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -795,7 +795,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2758a6277c52..a14cd89fe465 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -626,7 +626,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039994af..3f7bf451c034 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 603f3eb52041..3ae713c0abd9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -326,7 +326,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +341,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -352,7 +352,7 @@ typedef struct xfs_item_ops { void (*iop_push)(xfs_log_item_t *); bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -- cgit v1.2.3-58-ga151 From 810627d9a6d0e8820c798001875bc4e1b7754ebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2011 08:56:15 +0000 Subject: xfs: fix force shutdown handling in xfs_end_io Ensure ioend->io_error gets propagated back to e.g. 
AIO completions. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 33b13310ee0c..574d4ee9b625 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_end_io( int error = 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = -EIO; + ioend->io_error = -EIO; goto done; } if (ioend->io_error) -- cgit v1.2.3-58-ga151 From 917c16b2b69fc2eeb432eabca73258f08c58361e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 8 Nov 2011 14:49:59 -0500 Subject: Btrfs: fix oops on NULL trans handle in btrfs_truncate If we fail to reserve space in the transaction during truncate, we can error out with a NULL trans handle. The cleanup code needs an extra check to make sure we aren't trying to use the bad handle. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9d0eaa57d4ee..f60e2490bd0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6529,14 +6529,16 @@ end_trans: ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); -- cgit v1.2.3-58-ga151 From 7fd2ae21a42d178982679b86086661292b4afe4a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 8 Nov 2011 15:47:34 -0500 Subject: Btrfs: fix our reservations for updating an inode when completing io People have been reporting ENOSPC crashes in finish_ordered_io. This is because we try to steal from the delalloc block rsv to satisfy a reservation to update the inode. The problem with this is we don't explicitly save space for updating the inode when doing delalloc. This is kind of a problem and we've gotten away with this because way back when we just stole from the delalloc reserve without any questions, and this worked out fine because generally speaking the leaf had been modified either by the mtime update when we did the original write or because we just updated the leaf when we inserted the file extent item, only on rare occasions had the leaf not actually been modified, and that was still ok because we'd just use a block or two out of the over-reservation that is delalloc. Then came the delayed inode stuff. This is amazing, except it wants a full reservation for updating the inode since it may do it at some point down the road after we've written the blocks and we have to recow everything again. This worked out because the delayed inode stuff just stole from the global reserve, that is until recently when I changed that because it caused other problems. So here we are, we're doing everything right and being screwed for it. So take an extra reservation for the inode at delalloc reservation time and carry it through the life of the delalloc reservation. If we need it we can steal it in the delayed inode stuff. If we have already stolen it try and do a normal metadata reservation. 
If that fails try to steal from the delalloc reservation. If _that_ fails we'll get a WARN_ON() so I can start thinking of a better way to solve this and in the meantime we'll steal from the global reserve. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't see any problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 4 +-- fs/btrfs/delayed-inode.c | 65 +++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-tree.c | 22 +++++++++++++--- fs/btrfs/inode.c | 1 + 4 files changed, 83 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325a3935..634608d2a6d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bbe8496d5339..313ee14cf3b7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata( if (!ret) node->bytes_reserved = num_bytes; return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; } +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - node->bytes_reserved = num_bytes; + if (unlikely(ret)) { + /* This shouldn't happen */ + BUG_ON(release); + return ret; + } + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. 
This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + node->bytes_reserved = num_bytes; return ret; } @@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 18ea90c8943b..0b044e509e9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - return 0; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; - return dropped_extents; + return dropped_extents + drop_inode_space; } /** @@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; BTRFS_I(inode)->reserved_extents += nr_extents; + } - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + BTRFS_I(inode)->delalloc_meta_reserved = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f60e2490bd0d..2b920596c126 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; -- cgit v1.2.3-58-ga151 From a90e8b6fb80db43b029e1e76205452afa8bdc77a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2011 16:47:55 +0200 Subject: Btrfs: fix memory leak in btrfs_parse_early_options() Don't leak subvol_name string in case multiple subvol= options are given. 
"The lastest option is effective" behavior (consistent with subvolid= and subvolrootid= options) is preserved. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dcd5aef6b614..6befcaf253bd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: -- cgit v1.2.3-58-ga151 From f23c8af8ca2789eeb0ab9ea90c214f9694d96cc5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2011 19:15:05 +0200 Subject: Btrfs: fix subvol_name leak on error in btrfs_mount() btrfs_parse_early_options() can fail due to error while scanning devices (-o device= option), but still strdup() subvol_name string: mount -o subvol=SUBV,device=BAD_DEVICE So free subvol_name string on error. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6befcaf253bd..58e9492230ce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -905,8 +905,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } if (subvol_name) { root = mount_subvol(subvol_name, flags, device_name, data); -- cgit v1.2.3-58-ga151 From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 00:08:15 +0200 Subject: Btrfs: avoid null dereference and leaks when bailing from open_ctree() Fix bugs introduced by 6c41761f. Firstly, after failing to allocate any of the tree roots (first 'goto fail' in open_ctree()) we would dereference a NULL fs_info pointer in free_fs_info(). Secondly, after failures from init_srcu_struct(), setup_bdi() and new_inode() we would leak all earlier allocated roots: fs_info fields haven't been initialized yet so free_fs_info() is rendered useless. Fix this by initializing fs_info pointer and fs_info fields before any allocations happen. 
Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e53a5bb85670..91db90b526c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - struct btrfs_super_block *disk_super; + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); -- cgit v1.2.3-58-ga151 From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 13:26:37 +0200 Subject: Btrfs: close devices on all error paths in open_ctree() Fix a bug introduced by 7e662854 where we would leave devices busy on certain error paths in open_ctree(). fs_info is guaranteed to be non-NULL now so it's safe to dereference it on all error paths. 
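The fix below moves btrfs_close_devices() to the bottom of open_ctree()'s unwind ladder, which works because the function follows the classic kernel idiom: one label per setup step, run in reverse order, so each path releases exactly what was acquired. A generic sketch (step/undo names are placeholders):

	static int setup(void)
	{
		int err;

		err = step_a();
		if (err)
			goto fail;
		err = step_b();
		if (err)
			goto fail_a;
		err = step_c();
		if (err)
			goto fail_b;
		return 0;
	fail_b:
		undo_b();
	fail_a:
		undo_a();
	fail:
		return err;
	}

Placing a release at the final label is what guarantees it runs no matter which step failed.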
Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91db90b526c2..b6a5c0dd0dd8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2460,21 +2460,20 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); return ERR_PTR(err); recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) goto fail_tree_roots; -- cgit v1.2.3-58-ga151 From 04d21a244fdf79d0ac892eaaa9a46b682467277c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 14:41:22 +0200 Subject: Btrfs: rework error handling in btrfs_mount() Commits 6c41761f and 45ea6095 introduced the possibility of a NULL pointer dereference on error paths; we would also leave all devices busy and leak fs_info with all sub-structures on error when trying to mount an already mounted fs to a different directory. Fix this by doing all allocations before trying to open any of the devices, and adjust the error path for the mount-already-mounted-fs case. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 58e9492230ce..629281c65ff5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -891,7 +891,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -920,15 +919,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) return ERR_PTR(error); - error = btrfs_open_devices(fs_devices, mode, fs_type); - if (error) - return ERR_PTR(error); - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need
*/ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) { - error = -ENOMEM; - goto error_close_devices; - } - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -966,7 +964,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + error = -EBUSY; + goto error_close_devices; } btrfs_close_devices(fs_devices); @@ -997,6 +996,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); +error_fs_info: free_fs_info(fs_info); return ERR_PTR(error); } -- cgit v1.2.3-58-ga151 From 5e442a493fc59fa536c76db1fff5b49ca36a88c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2011 18:16:00 -0500 Subject: Revert "proc: fix races against execve() of /proc/PID/fd**" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5. It escalates some of the google-chrome SELinux problems with ptrace ("Check failed: pid_ > 0. Did not find zygote process"), and Andrew says that it is also causing mystery lockdep reports.
Reported-by: Alex Villacís Lasso Requested-by: James Morris Requested-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 146 +++++++++++++++++---------------------------------------- 1 file changed, 43 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 2db1bd3173b2..851ba3dcdc29 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,46 +1652,12 @@ out: return error; } -static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int rc; - - if (task == NULL) - return -ESRCH; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - generic_fillattr(inode, stat); - unlock_trace(task); - rc = 0; -out_task: - put_task_struct(task); - return rc; -} - static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; -static const struct inode_operations proc_fdinfo_link_inode_operations = { - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - -static const struct inode_operations proc_fd_link_inode_operations = { - .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - /* building an inode */ @@ -1923,61 +1889,49 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task; - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); - int rc; - - task = get_proc_task(inode); - if (!task) - return -ENOENT; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - rc = -ENOENT; - files = get_files_struct(task); - if (files == NULL) - goto out_unlock; - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - rc = 0; - } else - rc = -ENOENT; - spin_unlock(&files->file_lock); - put_files_struct(files); - -out_unlock: - unlock_trace(task); -out_task: - put_task_struct(task); - return rc; + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_fd_link_inode_operations; + inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; - result = ERR_PTR(-EACCES); - if (lock_trace(task)) - goto out; - result = instantiate(dir, dentry, task, &fd); - unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; - - retval = -EACCES; - if (lock_trace(p)) - goto out; - retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out_unlock; + goto out; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } - -out_unlock: - unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; - inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ -- cgit v1.2.3-58-ga151 From 2115133f8b9a8dbdb217d14080814df07ce90479 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 10 Nov 2011 20:39:08 -0500 Subject: Btrfs: tweak the delayed inode reservations again Josef sent along an incremental to the inode reservation code to make sure we try and fall back to directly updating the inode item if things go horribly wrong. This reworks that patch slightly, adding a fallback function that will always try to update the inode item directly without going through the delayed_inode code. 
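Reduced to its skeleton, the fallback added below is an error-code-driven retry: take the fast path, and only for the one failure that has a safe alternative, fall back to the slow path. A sketch with placeholder names (not the btrfs functions):

	static int update_with_fallback(struct item *it)
	{
		int ret = update_delayed(it);	/* fast path, needs reserved space */

		if (ret == -ENOSPC)		/* reservation failed: go direct */
			ret = update_direct(it);
		return ret;
	}

Only -ENOSPC triggers the retry; any other error still propagates, so genuine failures are not masked by the fallback.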
Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 9 +++---- fs/btrfs/inode.c | 64 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 313ee14cf3b7..6a1a6800776c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -692,11 +692,6 @@ static int btrfs_delayed_inode_reserve_metadata( migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (unlikely(ret)) { - /* This shouldn't happen */ - BUG_ON(release); - return ret; - } out: /* @@ -712,9 +707,11 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. work. */ + if (!ret) + node->bytes_reserved = num_bytes; + if (release) btrfs_block_rsv_release(root, src_rsv, num_bytes); - node->bytes_reserved = num_bytes; return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b920596c126..7d394335bcb5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; @@ -2426,7 +2428,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2434,21 +2436,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2476,6 +2463,43 @@ failed: return ret; } +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. 
+ * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and @@ -5632,7 +5656,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5670,7 +5694,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode(trans, root, inode); + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, -- cgit v1.2.3-58-ga151 From 924cd8fbe41851eda2b68bf2ed501b2777fd77b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:04 -0500 Subject: Btrfs: fix nocow when deleting the item btrfs_previous_item() just searches the b+ tree; it does not COW the nodes or leaves, so if we modify its result, the metadata will be broken. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f8e2943101a1..c37433d3cd82 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], -- cgit v1.2.3-58-ga151 From ba38eb4de354d228f2792f93cde2c748a3a3f3b2 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:04 -0500 Subject: Btrfs: fix no reserved space for writing out inode cache The inode cache code forgets to reserve space when writing the cache out. And when we do some stress testing, such as synctest, it will trigger the WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [] warn_slowpath_common+0x80/0x98 [] warn_slowpath_null+0x15/0x17 [] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [] ?
__set_page_dirty_nobuffers+0xfe/0x108 [] __btrfs_cow_block+0x118/0x3b5 [btrfs] [] btrfs_cow_block+0x103/0x14e [btrfs] [] btrfs_search_slot+0x249/0x6a4 [btrfs] [] btrfs_lookup_inode+0x2a/0x8a [btrfs] [] btrfs_update_inode+0xaa/0x141 [btrfs] [] btrfs_save_ino_cache+0xea/0x202 [btrfs] [] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [] commit_fs_roots+0xaa/0x158 [btrfs] [] btrfs_commit_transaction+0x405/0x731 [btrfs] [] ? wake_up_bit+0x25/0x25 [] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [] btrfs_sync_file+0x16a/0x198 [btrfs] [] ? mntput+0x21/0x23 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b Sometimes it causes the BUG_ON() in the delayed inode reservation code to be triggered. So we must reserve enough space for the inode cache. Note: if we cannot reserve enough space for the inode cache, we will give up writing it out. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 53dcbdf446cd..f8962a957d65 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -477,11 +494,14 @@ again: } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; -- cgit v1.2.3-58-ga151 From 3254c87618354e58fa2a7b375c6664f567480c33 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix unreleased path in btrfs_orphan_cleanup() When we ran a stress test for space relocation, a deadlock happened. By debugging, we found it was caused by carelessness: we forgot to unlock the read locks on the extent buffers in btrfs_orphan_cleanup() before ending the transaction handle, so the transaction commit task waited for the task that called btrfs_orphan_cleanup() to unlock the extent buffer, while that task waited for the commit task to finish the transaction commit, and the deadlock happened. Fix it.
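The deadlock above has a textbook shape: task A holds a tree-buffer lock and waits on the transaction commit, while the commit task waits on that same lock. The rule the fix enforces, sketched here as an ordering constraint rather than the literal patch, is to drop the path (and the extent-buffer locks it pins) before anything that can wait on the commit:

	btrfs_release_path(path);		/* drops the extent-buffer read locks */
	ret = btrfs_end_transaction(trans, root);	/* may block on the commit */

Running those two steps in the opposite order recreates the A-waits-B, B-waits-A cycle from the report.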
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d394335bcb5..e16215f480d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2201,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) -- cgit v1.2.3-58-ga151 From 61b520a9d0083b9b361638e456af45fd75150c87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: Abstract similar code for btrfs_block_rsv_add{, _noflush} btrfs_block_rsv_add{, _noflush}() have similar code, so abstract that code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b044e509e9f..0f47b3e2010e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3796,16 +3796,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3814,22 +3814,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + int btrfs_block_rsv_add_noflush(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } - - return ret; + return __block_rsv_add(root, block_rsv, num_bytes, 0); } int btrfs_block_rsv_check(struct btrfs_root *root, -- cgit v1.2.3-58-ga151 From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix orphan backref nodes If the root node of a fs/file tree is in the block group that is being relocated but the other nodes are not, and we create a snapshot for this tree between the time the relocation tree creation ends and the time ->create_reloc_tree is set to 0, Btrfs will create some backref nodes that are the lowest nodes of the backref cache. But we forget to add them to the ->leaves list of the backref cache and deal with them, so at last they will trigger a BUG_ON(): kernel BUG at fs/btrfs/relocation.c:239! This patch fixes it by adding them to the ->leaves list of the backref cache.
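Stated as an invariant, the fix that follows says: every node inserted at the lowest level of the backref cache must also be threaded onto the cache's leaves list, because teardown walks only that list to find bottom-level nodes. In sketch form (names follow the description above, not the exact code):

	/* a lowest-level node has no edges hanging below it */
	if (list_empty(&new_node->lower))
		list_add_tail(&new_node->lower, &cache->leaves);

The cloned nodes created while snapshotting took a path that skipped this step, so the consistency check at relocation.c:239 eventually met nodes it had never been told about.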
From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix orphan backref nodes If the root node of a fs/file tree is in a block group that is being relocated but the other nodes are not, and we create a snapshot of this tree between the time the relocation tree creation ends and the time ->create_reloc_tree is set to 0, Btrfs will create some backref nodes that are the lowest nodes of the backref cache. But we forgot to add them to the ->leaves list of the backref cache and deal with them, so eventually they triggered the BUG_ON(): kernel BUG at fs/btrfs/relocation.c:239! This patch fixes it by adding them to the ->leaves list of the backref cache. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 24d654ce7a06..dff29d5e151a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, -- cgit v1.2.3-58-ga151

From 2f120c05e67ae34c93786b1050c6828904314429 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: only map pages if we know we need them when reading the space cache People have been running into a warning when loading the space cache because the page is already mapped when we try to read in a bitmap. The way we read in entries and pages is kind of convoluted, so fix it so that io_ctl_read_entry maps the entries if it needs to, and if it hits the end of the page it simply unmaps the page. That way we can unconditionally unmap the io_ctl before reading in the bitmap, and we should stop hitting these warnings. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7a15fcfb3e1f..181760f9d2ab 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -537,6 +537,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); @@ -550,10 +557,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, io_ctl_unmap_page(io_ctl); - if (io_ctl->index >= io_ctl->num_pages) - return 0; - - return io_ctl_check_crc(io_ctl, io_ctl->index); + return 0; } static int io_ctl_read_bitmap(struct io_ctl *io_ctl, @@ -561,9 +565,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, { int ret; - if (io_ctl->cur && io_ctl->cur != io_ctl->orig) - io_ctl_unmap_page(io_ctl); - ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; @@ -699,6 +700,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, num_entries--; } + io_ctl_unmap_page(&io_ctl); + /* * We add the bitmaps at the end of the entries in order that * the bitmap entries are added to the cache. -- cgit v1.2.3-58-ga151
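In outline, the read pattern after this patch is (a sketch, not the literal code; page_exhausted is a placeholder name for the io_ctl bookkeeping check, not a real variable in the file):

	if (!io_ctl->cur) {
		/* nothing mapped yet: CRC-check the next page, which maps it */
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}
	/* ... decode one btrfs_free_space_entry from io_ctl->cur ... */
	if (page_exhausted)             /* that was the last entry on the page */
		io_ctl_unmap_page(io_ctl);
	return 0;

Because the mapping is dropped as soon as a page is exhausted, the bitmap path can assume no page is mapped and no longer needs its own unmap-before-read check.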
From 62f30c5462374b991e7e3f42d49ce2265c1b82f1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix deadlock caused by the race between relocation We cannot do a flushable reservation for relocation when we create a snapshot, because it may make the transaction commit task and the flush task wait for each other, and a deadlock happens. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 960835eaf4da..6a0574e923bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -882,8 +882,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; -- cgit v1.2.3-58-ga151

From 69f4cb526bd02ae5af35846f9a710c099eec3347 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 11 Nov 2011 08:17:10 -0500 Subject: Btrfs: handle bio_add_page failure gracefully in scrub Currently scrub fails with ENOMEM when bio_add_page fails. Unfortunately dm-based targets accept only one page per bio, thus making scrub always fail. This patch just submits the current bio when an error is encountered and starts a new one. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 64 +++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed11d3866afd..f4190f22edfb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -944,50 +944,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -995,6 +963,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -1015,12 +985,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -1030,6 +1010,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); -- cgit v1.2.3-58-ga151
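The crux is the retry on a full bio: bio_add_page() returns the number of bytes added, so 0 means the bio cannot take the page (with dm targets, after just one page). As added in the hunk above:

	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
	if (!ret) {                     /* bio is full: flush and retry */
		__free_page(page);
		ret = scrub_submit(sdev);   /* submit the partially filled bio */
		if (ret)
			return ret;
		goto again;                 /* sets up a fresh bio, then re-adds */
	}

This turns the former hard -ENOMEM failure into a shorter bio followed by a new one.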
From 8965593e41dd2d0e2a2f1e6f245336005ea94a2c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 11 Nov 2011 10:14:57 -0500 Subject: btrfs: rename the option to nospace_cache Rename the no_space_cache option to nospace_cache to be more consistent with the rest, where the simple prefix 'no' is used to negate an option. The option was introduced during the -rc1 cycle and has not been widely used, so the rename is safe. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 629281c65ff5..8bd9d6d0e07a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -197,7 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, - {Opt_no_space_cache, "no_space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -711,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); else - seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) -- cgit v1.2.3-58-ga151
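With the rename, the cache is disabled at mount time as, for example (device and mount point are placeholders):

	# mount -t btrfs -o nospace_cache /dev/sdb /mnt

Since the old no_space_cache spelling is removed from the option table above, any fstab entries or scripts still using it would need to be updated to the new spelling.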