From 79aec9844de339531f05b019644ccaf5dd777144 Mon Sep 17 00:00:00 2001
From: Sam Lang <sam.lang@inktank.com>
Date: Wed, 19 Dec 2012 09:44:23 -1000
Subject: ceph: Check for err on mds request in atomic_open

The error returned by ceph_mdsc_do_request includes errors sending the
request, errors on timeout, or any errors coming from the mds.  If
ceph_mdsc_do_request returns an error, the reply struct will most likely
be bogus.  We need to bail out and propagate the error instead of
overwriting it.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/file.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d415096800a6..2c71cbd78332 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	err = ceph_mdsc_do_request(mdsc,
 				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
 				   req);
+	if (err)
+		goto out_err;
+
 	err = ceph_handle_snapdir(req, dentry, err);
 	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
-- 
cgit v1.2.3-58-ga151


From 6e8575faa8fa680d59404a4d58d12190667be815 Mon Sep 17 00:00:00 2001
From: Sam Lang <sam.lang@inktank.com>
Date: Fri, 28 Dec 2012 09:56:46 -0800
Subject: ceph: Check for created flag in response from mds

The mds now sends back a created inode if the create request
performed the create.  If the file already existed, no inode is
returned in the reply.  This allows ceph to set the created flag
in atomic_open so that permissions are properly checked in the case
that the file wasn't created by the create call to the mds.

To ensure compatibility with previous kernels, a feature for sending
back the inode in the create reply was added, so that the mds will
only send back the inode if the client indicates it supports the
feature.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/file.c                     |  3 +++
 fs/ceph/mds_client.c               | 33 +++++++++++++++++++++++++++++++--
 fs/ceph/mds_client.h               |  6 ++++++
 include/linux/ceph/ceph_features.h |  5 ++++-
 4 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2c71cbd78332..22b5b71b5401 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -266,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = finish_no_open(file, dn);
 	} else {
 		dout("atomic_open finish_open on dn %p\n", dn);
+		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+			*opened |= FILE_CREATED;
+		}
 		err = finish_open(file, dentry, ceph_open, opened);
 	}
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9165eb8309eb..d95842036c8b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -232,6 +232,30 @@ bad:
 	return -EIO;
 }
 
+/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  int features)
+{
+	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+		if (*p == end) {
+			info->has_create_ino = false;
+		} else {
+			info->has_create_ino = true;
+			info->ino = ceph_decode_64(p);
+		}
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
 /*
  * parse extra results
  */
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
 {
 	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
 		return parse_reply_info_filelock(p, end, info, features);
-	else
+	else if (info->head->op == CEPH_MDS_OP_READDIR)
 		return parse_reply_info_dir(p, end, info, features);
+	else if (info->head->op == CEPH_MDS_OP_CREATE)
+		return parse_reply_info_create(p, end, info, features);
+	else
+		return -EIO;
 }
 
 /*
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
-		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+				    req->r_op == CEPH_MDS_OP_LSSNAP) &&
 		    rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index dd26846dd71d..567f7c60354e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
 			struct ceph_mds_reply_info_in *dir_in;
 			u8                            dir_complete, dir_end;
 		};
+
+		/* for create results */
+		struct {
+			bool has_create_ino;
+			u64 ino;
+		};
 	};
 
 	/* encoded blob describing snapshot contexts for certain
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b0c0e6..6b7c6acbb3bf 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -14,13 +14,16 @@
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
 /* bits 8-17 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
+/* bits 19-25 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
 
 /*
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
 	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES)
+	 CEPH_FEATURE_CRUSH_TUNABLES |   \
+	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR)
-- 
cgit v1.2.3-58-ga151


From 8a92a119b292012a9bd920b908c3e9f1c512291d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:28:07 +0800
Subject: ceph: move dirty inode to migrating list when clearing auth caps

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a1d9bb30c1bf..a9fe2d5784c9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ retry:
 
 	if (flags & CEPH_CAP_FLAG_AUTH)
 		ci->i_auth_cap = cap;
-	else if (ci->i_auth_cap == cap)
+	else if (ci->i_auth_cap == cap) {
 		ci->i_auth_cap = NULL;
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			dout(" moving %p to cap_dirty_migrating\n", inode);
+			list_move(&ci->i_dirty_item,
+				  &mdsc->cap_dirty_migrating);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
 
 	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
-- 
cgit v1.2.3-58-ga151


From 395c312b9c535d57db122cbb5b7292223561d0b8 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:37:57 +0800
Subject: ceph: allow revoking duplicated caps issued by non-auth MDS

Allow revoking duplicated caps issued by non-auth MDS if these caps
are also issued by auth MDS.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a9fe2d5784c9..76b19239c426 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1468,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
-	int file_wanted, used;
+	int file_wanted, used, cap_used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
@@ -1571,9 +1571,14 @@ retry_locked:
 
 		/* NOTE: no side-effects allowed, until we take s_mutex */
 
+		cap_used = used;
+		if (ci->i_auth_cap && cap != ci->i_auth_cap)
+			cap_used &= ~ci->i_auth_cap->issued;
+
 		revoking = cap->implemented & ~cap->issued;
-		dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
 		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap_used),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -1601,7 +1606,7 @@ retry_locked:
 		}
 
 		/* completed revocation? going down and there are no caps? */
-		if (revoking && (revoking & used) == 0) {
+		if (revoking && (revoking & cap_used) == 0) {
 			dout("completed revocation of %s\n",
 			     ceph_cap_string(cap->implemented & ~cap->issued));
 			goto ack;
@@ -1678,8 +1683,8 @@ ack:
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
-				      retain, flushing, NULL);
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+				      want, retain, flushing, NULL);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
-- 
cgit v1.2.3-58-ga151


From 66f58691c5c820283dd7e4d6fe8649033ed43ceb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:45:18 +0800
Subject: ceph: allocate cap_release message when receiving cap import

When the client wants to release an imported cap, it's possible there
is no reserved cap_release message in the corresponding mds session,
so __queue_cap_release causes a kernel panic.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 76b19239c426..40b5bbe63a39 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2833,6 +2833,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
 	     (unsigned)seq);
 
+	if (op == CEPH_CAP_OP_IMPORT)
+		ceph_add_cap_releases(mdsc, session);
+
 	/* lookup ino */
 	inode = ceph_find_inode(sb, vino);
 	ci = ceph_inode(inode);
-- 
cgit v1.2.3-58-ga151


From 390306c38dd43908f7f7730229999790a773d1d5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 15:30:10 +0800
Subject: ceph: check mds_wanted for imported cap

The MDS may have incorrect wanted caps after importing caps, so the client
should check the value the MDS has and send a cap update if necessary.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/caps.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 40b5bbe63a39..1e1e02055a2b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2429,7 +2429,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		dout("mds wanted %s -> %s\n",
 		     ceph_cap_string(le32_to_cpu(grant->wanted)),
 		     ceph_cap_string(wanted));
-		grant->wanted = cpu_to_le32(wanted);
+		/* imported cap may not have correct mds_wanted */
+		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+			check_caps = 1;
 	}
 
 	cap->seq = seq;
-- 
cgit v1.2.3-58-ga151


From e8afad656cbcd06d02a7bacd4b318fa0e2907de0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:19 -0600
Subject: libceph: pass length to ceph_calc_file_object_mapping()

ceph_calc_file_object_mapping() takes (among other things) a "file"
offset and length, and based on the layout, determines the object
number ("bno") backing the affected portion of the file's data and
the offset into that object where the desired range begins.  It also
computes the size that should be used for the request--either the
amount requested or something less if that would exceed the end of
the object.

This patch changes the input length parameter in this function so it
is used only for input.  That is, the argument will be passed by
value rather than by address, so the value provided won't get
updated by the function.

The value would only get updated if the length would surpass the
current object, and in that case the value it got updated to would
be exactly that returned in *oxlen.

Only one of the two callers is affected by this change.  Update
ceph_calc_raw_layout() so it records any updated value.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/ioctl.c             | 2 +-
 include/linux/ceph/osdmap.h | 2 +-
 net/ceph/osd_client.c       | 6 ++++--
 net/ceph/osdmap.c           | 9 ++++-----
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 36549a46e311..3b22150d3e19 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -194,7 +194,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		return -EFAULT;
 
 	down_read(&osdc->map_sem);
-	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
 					  &dl.object_no, &dl.object_offset,
 					  &olen);
 	if (r < 0)
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5ea57ba69320..1f653e2ff5cc 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					 u64 off, u64 *plen,
+					 u64 off, u64 len,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e29a3ed92958..47e5f5b1f94c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -53,13 +53,15 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 	reqhead->snapid = cpu_to_le64(snapid);
 
 	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
 					  &objoff, &objlen);
 	if (r < 0)
 		return r;
-	if (*plen < orig_len)
+	if (objlen < orig_len) {
+		*plen = objlen;
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
 		     orig_len - *plen, off, *plen);
+	}
 
 	if (op_has_extent(op->op)) {
 		u32 osize = le32_to_cpu(layout->fl_object_size);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index ca05871635bc..369f03ba9ee5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1016,7 +1016,7 @@ bad:
  * pass a stride back to the caller.
  */
 int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-				   u64 off, u64 *plen,
+				   u64 off, u64 len,
 				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
 {
@@ -1027,7 +1027,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	u32 su_per_object;
 	u64 t, su_offset;
 
-	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
 	     osize, su);
 	if (su == 0 || sc == 0)
 		goto invalid;
@@ -1060,11 +1060,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	/*
 	 * Calculate the length of the extent being written to the selected
-	 * object. This is the minimum of the full length requested (plen) or
+	 * object. This is the minimum of the full length requested (len) or
 	 * the remainder of the current stripe being written to.
 	 */
-	*oxlen = min_t(u64, *plen, su - su_offset);
-	*plen = *oxlen;
+	*oxlen = min_t(u64, len, su - su_offset);
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
 	return 0;
-- 
cgit v1.2.3-58-ga151


From 39b648d9ec7d4ab0b4362872c6284a12c582afa6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 Jan 2013 11:53:05 -0800
Subject: ceph: remove 'ceph.layout' virtual xattr

This has been deprecated since v3.3, 114fc474.  Kill it.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be9902..c2048b1a5395 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -123,13 +123,6 @@ static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
 
 static struct ceph_vxattr ceph_file_vxattrs[] = {
 	XATTR_NAME_CEPH(file, layout),
-	/* The following extended attribute name is deprecated */
-	{
-		.name = XATTR_CEPH_PREFIX "layout",
-		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
-		.getxattr_cb = ceph_vxattrcb_file_layout,
-		.readonly = true,
-	},
 	{ 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
-- 
cgit v1.2.3-58-ga151


From 8860147a01c4243f64f7d602dbf8342ca616ed45 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 Jan 2013 11:53:27 -0800
Subject: ceph: support hidden vxattrs

Add ability to flag virtual xattrs as hidden, such that you can
getxattr them but they do not appear in listxattr.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c2048b1a5395..43063d0dee8f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,7 +29,7 @@ struct ceph_vxattr {
 	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
-	bool readonly;
+	bool readonly, hidden;
 };
 
 /* directories */
@@ -85,13 +85,14 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 
-#define XATTR_NAME_CEPH(_type, _name) \
-		{ \
-			.name = CEPH_XATTR_NAME(_type, _name), \
-			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
-			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
-			.readonly = true, \
-		}
+#define XATTR_NAME_CEPH(_type, _name)					\
+	{								\
+		.name = CEPH_XATTR_NAME(_type, _name),			\
+		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+		.readonly = true,				\
+		.hidden = false,				\
+	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, entries),
@@ -157,7 +158,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
 	size_t size = 0;
 
 	for (vxattr = vxattrs; vxattr->name; vxattr++)
-		size += vxattr->name_size;
+		if (!vxattr->hidden)
+			size += vxattr->name_size;
 
 	return size;
 }
-- 
cgit v1.2.3-58-ga151


From 3adf654ddbc355c23d75c6684128d4b067a7b792 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 31 Jan 2013 11:53:41 -0800
Subject: ceph: pass unhandled ceph.* setxattrs through to MDS

If we do not specifically understand a setxattr on a ceph.* virtual
xattr, send it through to the MDS.  This allows us to implement new
functionality via the MDS without direct support on the client side.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 43063d0dee8f..edc47de77fed 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -777,6 +777,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (vxattr && vxattr->readonly)
 		return -EOPNOTSUPP;
 
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
 	newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -833,6 +837,7 @@ retry:
 
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
 	err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
 	kfree(newname);
-- 
cgit v1.2.3-58-ga151


From d421acb1ad7dfa31b7463b67f1593714b0b727c3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 21:55:30 -0800
Subject: ceph: pass ceph.* removexattrs through to MDS

If we do not explicitly recognize a vxattr (e.g., as readonly), pass
the request through to the MDS and deal with it there.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index edc47de77fed..234270f00c2a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -892,6 +892,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (vxattr && vxattr->readonly)
 		return -EOPNOTSUPP;
 
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
 retry:
@@ -931,6 +935,7 @@ retry:
 	return err;
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
 	err = ceph_send_removexattr(dentry, name);
 out:
 	return err;
-- 
cgit v1.2.3-58-ga151


From f36e4472969a78ae65e514b553e9a0feacb40a28 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 21:59:29 -0800
Subject: ceph: add exists_cb to vxattr struct

Allow for a callback to dynamically determine if a vxattr exists for
the given inode.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 234270f00c2a..06344da4e968 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -30,6 +30,7 @@ struct ceph_vxattr {
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
 	bool readonly, hidden;
+	bool (*exists_cb)(struct ceph_inode_info *ci);
 };
 
 /* directories */
@@ -92,6 +93,7 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
 		.readonly = true,				\
 		.hidden = false,				\
+		.exists_cb = NULL,			\
 	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
-- 
cgit v1.2.3-58-ga151


From 0bee82fb4b8d49541fe474ed460d2b917f329568 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:00:58 -0800
Subject: ceph: fix getxattr vxattr handling

Change the vxattr handling for getxattr so that vxattrs are checked
prior to any xattr content, and never after.  Enforce vxattr existence
via the exists_cb callback.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 06344da4e968..87b85f3403d4 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -569,13 +569,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
-	/* let's see if a virtual xattr was requested */
-	vxattr = ceph_match_vxattr(inode, name);
-
 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
 	     ci->i_xattrs.version, ci->i_xattrs.index_version);
 
+	/* let's see if a virtual xattr was requested */
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+		err = vxattr->getxattr_cb(ci, value, size);
+		goto out;
+	}
+
 	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
 	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
 		goto get_xattr;
@@ -589,11 +593,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 
 	spin_lock(&ci->i_ceph_lock);
 
-	if (vxattr && vxattr->readonly) {
-		err = vxattr->getxattr_cb(ci, value, size);
-		goto out;
-	}
-
 	err = __build_xattrs(inode);
 	if (err < 0)
 		goto out;
@@ -601,11 +600,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 get_xattr:
 	err = -ENODATA;  /* == ENOATTR */
 	xattr = __get_xattr(ci, name);
-	if (!xattr) {
-		if (vxattr)
-			err = vxattr->getxattr_cb(ci, value, size);
+	if (!xattr)
 		goto out;
-	}
 
 	err = -ERANGE;
 	if (size && size < xattr->val_len)
-- 
cgit v1.2.3-58-ga151


From b65917dd2700b7d12e25e2e0bbfd58eb3c932158 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:02:39 -0800
Subject: ceph: fix listxattr handling for vxattrs

Only include vxattrs in the result if they are not hidden and exist
(as determined by the exists_cb callback).

Note that the buffer size we return when 0 is passed in always includes
vxattrs that *might* exist, forming an upper bound.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 87b85f3403d4..ec09ea5c4f07 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -657,23 +657,30 @@ list_xattr:
 	vir_namelen = ceph_vxattrs_name_size(vxattrs);
 
 	/* adding 1 byte per each variable due to the null termination */
-	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
-	if (size && namelen > size)
+	if (size && vir_namelen + namelen > size)
 		goto out;
 
-	err = namelen;
+	err = namelen + vir_namelen;
 	if (size == 0)
 		goto out;
 
 	names = __copy_xattr_names(ci, names);
 
 	/* virtual xattr names, too */
-	if (vxattrs)
+	err = namelen;
+	if (vxattrs) {
 		for (i = 0; vxattrs[i].name; i++) {
-			len = sprintf(names, "%s", vxattrs[i].name);
-			names += len + 1;
+			if (!vxattrs[i].hidden &&
+			    !(vxattrs[i].exists_cb &&
+			      !vxattrs[i].exists_cb(ci))) {
+				len = sprintf(names, "%s", vxattrs[i].name);
+				names += len + 1;
+				err += len + 1;
+			}
 		}
+	}
 
 out:
 	spin_unlock(&ci->i_ceph_lock);
-- 
cgit v1.2.3-58-ga151


From 32ab0bd78d7d9235efb38ad5cba6a3a6b39a1da6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 19 Jan 2013 16:46:32 -0800
Subject: ceph: change ceph.file.layout.* implementation, content

Implement a new method to generate the ceph.file.layout vxattr using
the new framework.

Use 'stripe_unit' instead of 'chunk_size'.

Include pool name, either as a string or as an integer.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 67 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index ec09ea5c4f07..532c95a6c9fa 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -33,6 +33,50 @@ struct ceph_vxattr {
 	bool (*exists_cb)(struct ceph_inode_info *ci);
 };
 
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+	size_t s;
+	char *p = (char *)&ci->i_layout;
+
+	for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+		if (*p)
+			return true;
+	return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size,
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		pool_name);
+	else
+		ret = snprintf(val, size,
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		(unsigned long long)pool);
+
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -84,6 +128,7 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 			(long)ci->i_rctime.tv_nsec);
 }
 
+
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 
 #define XATTR_NAME_CEPH(_type, _name)					\
@@ -111,21 +156,15 @@ static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
 /* files */
 
-static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
-				   size_t size)
-{
-	int ret;
-
-	ret = snprintf(val, size,
-		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
-		(unsigned long long)ceph_file_layout_su(ci->i_layout),
-		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-	return ret;
-}
-
 static struct ceph_vxattr ceph_file_vxattrs[] = {
-	XATTR_NAME_CEPH(file, layout),
+	{
+		.name = "ceph.file.layout",
+		.name_size = sizeof("ceph.file.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = false,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
 	{ 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
-- 
cgit v1.2.3-58-ga151


From 1f08f2b056ea6c2e12f4e95e88949a882a996208 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:07:12 -0800
Subject: ceph: add ceph.dir.layout vxattr

This virtual xattr will only appear when there is a dir layout policy
set on the directory.  It can be set via setxattr and removed via
removexattr (implemented by the MDS).

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 532c95a6c9fa..f3c4fe7202c7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -142,6 +142,14 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	{
+		.name = "ceph.dir.layout",
+		.name_size = sizeof("ceph.dir.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = false,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),
-- 
cgit v1.2.3-58-ga151


From 695b711933689ea51af782760f4b1e2c6a42a631 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 20 Jan 2013 22:08:33 -0800
Subject: ceph: implement hidden per-field ceph.*.layout.* vxattrs

Allow individual fields of the layout to be fetched via getxattr.
The ceph.dir.layout.* vxattrs will "disappear" if the exists_cb
indicates there is no dir layout set.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Lang <sam.lang@inktank.com>
---
 fs/ceph/xattr.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f3c4fe7202c7..2135817e708d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,6 +77,46 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	return ret;
 }
 
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+			(unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+						char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+					char *val, size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size, "%s", pool_name);
+	else
+		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -130,6 +170,8 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 
 
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2)	\
+	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
 
 #define XATTR_NAME_CEPH(_type, _name)					\
 	{								\
@@ -140,6 +182,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 		.hidden = false,				\
 		.exists_cb = NULL,			\
 	}
+#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
+	{								\
+		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
+		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+		.readonly = false,				\
+		.hidden = true,			\
+		.exists_cb = ceph_vxattrcb_layout_exists,	\
+	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	{
@@ -150,6 +201,10 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 		.hidden = false,
 		.exists_cb = ceph_vxattrcb_layout_exists,
 	},
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(dir, layout, object_size),
+	XATTR_LAYOUT_FIELD(dir, layout, pool),
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),
@@ -173,6 +228,10 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 		.hidden = false,
 		.exists_cb = ceph_vxattrcb_layout_exists,
 	},
+	XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(file, layout, object_size),
+	XATTR_LAYOUT_FIELD(file, layout, pool),
 	{ 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
-- 
cgit v1.2.3-58-ga151


From 87f979d390f9ecfa3d0038a9f9a002a62f8a1895 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: ceph: kill ceph_osdc_writepages() "nofail" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes the value true as its "nofail" argument.  Get rid of that
argument and replace its use in ceph_osdc_writepages() with the
constant value true.

This and a number of cleanup patches that follow resolve:
    http://tracker.ceph.com/issues/4126

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 2 +-
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 064d1a68d2c1..c7e401c96fc9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -493,7 +493,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   page_off, len,
 				   ci->i_truncate_seq, ci->i_truncate_size,
 				   &inode->i_mtime,
-				   &page, 1, 0, 0, true);
+				   &page, 1, 0, 0);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 82bf6338d6c1..afcb255b016a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -275,7 +275,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				u32 truncate_seq, u64 truncate_size,
 				struct timespec *mtime,
 				struct page **pages, int nr_pages,
-				int flags, int do_sync, bool nofail);
+				int flags, int do_sync);
 
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d9d58bbe9f9a..dd01b1340e95 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1867,7 +1867,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 u32 truncate_seq, u64 truncate_size,
 			 struct timespec *mtime,
 			 struct page **pages, int num_pages,
-			 int flags, int do_sync, bool nofail)
+			 int flags, int do_sync)
 {
 	struct ceph_osd_request *req;
 	int rc = 0;
@@ -1880,7 +1880,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 					    CEPH_OSD_FLAG_WRITE,
 				    snapc, do_sync,
 				    truncate_seq, truncate_size, mtime,
-				    nofail, 1, page_align);
+				    true, 1, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1889,7 +1889,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	dout("writepages %llu~%llu (%d pages)\n", off, len,
 	     req->r_num_pages);
 
-	rc = ceph_osdc_start_request(osdc, req, nofail);
+	rc = ceph_osdc_start_request(osdc, req, true);
 	if (!rc)
 		rc = ceph_osdc_wait_request(osdc, req);
 
-- 
cgit v1.2.3-58-ga151


From fbf8685fb155e12a9f4d4b966c7b3442ed557687 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: ceph: kill ceph_osdc_writepages() "dosync" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes 0 as its "dosync" argument.  Get rid of that argument and
replace its use in ceph_osdc_writepages() with 0.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 2 +-
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c7e401c96fc9..bef552819312 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -493,7 +493,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   page_off, len,
 				   ci->i_truncate_seq, ci->i_truncate_size,
 				   &inode->i_mtime,
-				   &page, 1, 0, 0);
+				   &page, 1, 0);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index afcb255b016a..7a63100a3e69 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -275,7 +275,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				u32 truncate_seq, u64 truncate_size,
 				struct timespec *mtime,
 				struct page **pages, int nr_pages,
-				int flags, int do_sync);
+				int flags);
 
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index dd01b1340e95..ac186b7c9986 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1867,7 +1867,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 u32 truncate_seq, u64 truncate_size,
 			 struct timespec *mtime,
 			 struct page **pages, int num_pages,
-			 int flags, int do_sync)
+			 int flags)
 {
 	struct ceph_osd_request *req;
 	int rc = 0;
@@ -1878,7 +1878,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    CEPH_OSD_OP_WRITE,
 				    flags | CEPH_OSD_FLAG_ONDISK |
 					    CEPH_OSD_FLAG_WRITE,
-				    snapc, do_sync,
+				    snapc, 0,
 				    truncate_seq, truncate_size, mtime,
 				    true, 1, page_align);
 	if (IS_ERR(req))
-- 
cgit v1.2.3-58-ga151


From 2480882611e3ab844563dd3d0a822227604ab8fe Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: ceph: kill ceph_osdc_writepages() "flags" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes 0 as its "flags" argument.  Get rid of that argument and
replace its use in ceph_osdc_writepages() with 0.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 3 +--
 include/linux/ceph/osd_client.h | 3 +--
 net/ceph/osd_client.c           | 6 ++----
 3 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index bef552819312..8d3240d6f289 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -492,8 +492,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   &ci->i_layout, snapc,
 				   page_off, len,
 				   ci->i_truncate_seq, ci->i_truncate_size,
-				   &inode->i_mtime,
-				   &page, 1, 0);
+				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 7a63100a3e69..6540e8861998 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -274,8 +274,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				u64 off, u64 len,
 				u32 truncate_seq, u64 truncate_size,
 				struct timespec *mtime,
-				struct page **pages, int nr_pages,
-				int flags);
+				struct page **pages, int nr_pages);
 
 /* watch/notify events */
 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac186b7c9986..d4e3812bcebc 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1866,8 +1866,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 u64 off, u64 len,
 			 u32 truncate_seq, u64 truncate_size,
 			 struct timespec *mtime,
-			 struct page **pages, int num_pages,
-			 int flags)
+			 struct page **pages, int num_pages)
 {
 	struct ceph_osd_request *req;
 	int rc = 0;
@@ -1876,8 +1875,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	BUG_ON(vino.snap != CEPH_NOSNAP);
 	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
 				    CEPH_OSD_OP_WRITE,
-				    flags | CEPH_OSD_FLAG_ONDISK |
-					    CEPH_OSD_FLAG_WRITE,
+				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
 				    snapc, 0,
 				    truncate_seq, truncate_size, mtime,
 				    true, 1, page_align);
-- 
cgit v1.2.3-58-ga151


From a3bea47e8bdd51d921e5b2045720d60140612c7c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:29 -0600
Subject: ceph: kill ceph_osdc_new_request() "num_reply" parameter

The "num_reply" parameter to ceph_osdc_new_request() is never
used inside that function, so get rid of it.

Note that ceph_sync_write() passes 2 for that argument, while all
other callers pass 1.  It doesn't matter, but perhaps someone should
verify this doesn't indicate a problem.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/addr.c                  | 4 ++--
 fs/ceph/file.c                  | 2 +-
 include/linux/ceph/osd_client.h | 3 +--
 net/ceph/osd_client.c           | 6 +++---
 4 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8d3240d6f289..fc613715af46 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -315,7 +315,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    NULL, false, 1, 0);
+				    NULL, false, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -837,7 +837,7 @@ get_more_pages:
 					    snapc, do_sync,
 					    ci->i_truncate_seq,
 					    ci->i_truncate_size,
-					    &inode->i_mtime, true, 1, 0);
+					    &inode->i_mtime, true, 0);
 
 				if (IS_ERR(req)) {
 					rc = PTR_ERR(req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a1e5b81e8118..9c4325e654ca 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -541,7 +541,7 @@ more:
 				    ci->i_snap_realm->cached_context,
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2, page_align);
+				    &mtime, false, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 6540e8861998..5812802bd8ae 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -234,8 +234,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      int do_sync, u32 truncate_seq,
 				      u64 truncate_size,
 				      struct timespec *mtime,
-				      bool use_mempool, int num_reply,
-				      int page_align);
+				      bool use_mempool, int page_align);
 
 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
 					 struct ceph_osd_request *req);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d4e3812bcebc..d3e75138506b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -393,7 +393,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       u32 truncate_seq,
 					       u64 truncate_size,
 					       struct timespec *mtime,
-					       bool use_mempool, int num_reply,
+					       bool use_mempool,
 					       int page_align)
 {
 	struct ceph_osd_req_op ops[2];
@@ -1837,7 +1837,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0, truncate_seq, truncate_size, NULL,
-				    false, 1, page_align);
+				    false, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1878,7 +1878,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
 				    snapc, 0,
 				    truncate_seq, truncate_size, mtime,
-				    true, 1, page_align);
+				    true, page_align);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-- 
cgit v1.2.3-58-ga151


From 0eb40bf65e2fcc1c23ed7c9a10cd0890ee59e68f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 15 Feb 2013 11:42:30 -0600
Subject: libceph: update ceph_mds_state_name() and ceph_mds_op_name()

Update ceph_mds_state_name() and ceph_mds_op_name() to include the
newly-added definitions in "ceph_fs.h", and to match its counterpart
in the user space code.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/strings.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c804..89fa4a940a0f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
 	case CEPH_MDS_STATE_BOOT:       return "up:boot";
 	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
 	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
 	case CEPH_MDS_STATE_CREATING:   return "up:creating";
 	case CEPH_MDS_STATE_STARTING:   return "up:starting";
 		/* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_LOOKUP:  return "lookup";
 	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
 	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
 	case CEPH_MDS_OP_GETATTR:  return "getattr";
 	case CEPH_MDS_OP_SETXATTR: return "setxattr";
 	case CEPH_MDS_OP_SETATTR: return "setattr";
 	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
 	case CEPH_MDS_OP_READDIR: return "readdir";
 	case CEPH_MDS_OP_MKNOD: return "mknod";
 	case CEPH_MDS_OP_LINK: return "link";
-- 
cgit v1.2.3-58-ga151


From 9e0eb85d5861d512759caf1301670b36d4c221ed Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 6 Feb 2013 13:11:38 -0600
Subject: ceph: remove a few bogus declarations

There are three ceph page vector functions declared in
"fs/ceph/super.h" that don't belong there.  They're
probably left over from some long-ago code reorganization.

They're properly declared in "include/linux/ceph/libceph.h"
so just delete the ones in "super.h".

This and the next few commits resolve:
    http://tracker.ceph.com/issues/4053

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/super.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 66ebe720e40d..9861cce10a49 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
-				    const char *data,
-				    loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
-				    char *data,
-				    loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 			    struct file *file, unsigned flags, umode_t mode,
-- 
cgit v1.2.3-58-ga151


From 92a49fb0f79f3300e6e50ddf56238e70678e4202 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 22 Feb 2013 15:31:00 -0800
Subject: ceph: fix statvfs fr_size

Different versions of glibc are broken in different ways, but the short of
it is that for the time being, frsize should == bsize, and be used as the
multiple for the blocks, free, and available fields.  This mirrors what is
done for NFS.  The previous reporting of the page size for frsize meant
that newer glibc and df would report a very small value for the fs size.

Fixes http://tracker.ceph.com/issues/3793.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
---
 fs/ceph/super.c | 7 ++++++-
 fs/ceph/super.h | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e86aa9948124..9fe17c6c2876 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * express utilization in terms of large blocks to avoid
 	 * overflow on 32-bit machines.
+	 *
+	 * NOTE: for the time being, we make bsize == frsize to humor
+	 * not-yet-ancient versions of glibc that are broken.
+	 * Someday, we will probably want to report a real block
+	 * size...  whatever that may mean for a network file system!
 	 */
 	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
 	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = le64_to_cpu(st.num_objects);
 	buf->f_ffree = -1;
 	buf->f_namelen = NAME_MAX;
-	buf->f_frsize = PAGE_CACHE_SIZE;
 
 	/* leave fsid little-endian, regardless of host endianness */
 	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 9861cce10a49..604526a0d6cd 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
 
 /* large granularity for statfs utilization stats to facilitate
  * large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT   20  /* 1 MB */
+#define CEPH_BLOCK_SHIFT   22  /* 4 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
-- 
cgit v1.2.3-58-ga151


From 2c3dd4ff595e604cd4c4c51cff7a208f23148c2d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 19 Feb 2013 12:25:56 -0600
Subject: ceph: eliminate sparse warnings in fs code

Fix the causes for sparse warnings reported in the ceph file system
code.  Here there are only two (and they're sort of silly but
they're easy to fix).

This partially resolves:
    http://tracker.ceph.com/issues/4184

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
---
 fs/ceph/xattr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2135817e708d..9b6b2b6dd164 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -213,7 +213,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, rsubdirs),
 	XATTR_NAME_CEPH(dir, rbytes),
 	XATTR_NAME_CEPH(dir, rctime),
-	{ 0 }	/* Required table terminator */
+	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
@@ -232,7 +232,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
 	XATTR_LAYOUT_FIELD(file, layout, object_size),
 	XATTR_LAYOUT_FIELD(file, layout, pool),
-	{ 0 }	/* Required table terminator */
+	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
 
-- 
cgit v1.2.3-58-ga151


From 12979354a1d6ef25d86f381e4d5f9e103f29913a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 8 Jan 2013 09:15:10 -0800
Subject: libceph: rename ceph_pg -> ceph_pg_v1

Rename the old version of this type to distinguish it from the new version.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c                 |  2 +-
 include/linux/ceph/osd_client.h |  2 +-
 include/linux/ceph/osdmap.h     |  7 ++++---
 include/linux/ceph/rados.h      |  4 ++--
 net/ceph/osd_client.c           |  2 +-
 net/ceph/osdmap.c               | 18 +++++++++---------
 6 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 3b22150d3e19..e831436d6e68 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -186,7 +186,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int r;
 
 	/* copy and validate */
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 388158ff0cbc..be2867330e23 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -56,7 +56,7 @@ struct ceph_osd_request {
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd;
 	struct ceph_osd *r_osd;
-	struct ceph_pg   r_pgid;
+	struct ceph_pg_v1 r_pgid;
 	int              r_pg_osds[CEPH_PG_MAX_SIZE];
 	int              r_num_pg_osds;
 
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index c83a838f89f5..eb4989aa48e8 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -28,7 +28,7 @@ struct ceph_pg_pool_info {
 
 struct ceph_pg_mapping {
 	struct rb_node node;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int len;
 	int osds[];
 };
@@ -118,10 +118,11 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
 				   const char *oid,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
+			       struct ceph_pg_v1 pgid,
 			       int *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg pgid);
+				struct ceph_pg_v1 pgid);
 
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index b65182aba6f7..e7cece69b13f 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -64,7 +64,7 @@ struct ceph_timespec {
  * placement group.
  * we encode this into one __le64.
  */
-struct ceph_pg {
+struct ceph_pg_v1 {
 	__le16 preferred; /* preferred primary osd */
 	__le16 ps;        /* placement seed */
 	__le32 pool;      /* object pool */
@@ -128,7 +128,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
  * object layout - how a given object should be stored.
  */
 struct ceph_object_layout {
-	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	struct ceph_pg_v1 ol_pgid;   /* raw pg, with _full_ ps precision. */
 	__le32 ol_stripe_unit;    /* for per-object parity, if any */
 } __attribute__ ((packed));
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 39629b66f3b1..e3ab8d60d080 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -914,7 +914,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int acting[CEPH_PG_MAX_SIZE];
 	int o = -1, num = 0;
 	int err;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 3c61e21611d3..8c89ac25081a 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -350,7 +350,7 @@ bad:
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds)
  */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+static int pgid_cmp(struct ceph_pg_v1 l, struct ceph_pg_v1 r)
 {
 	u64 a = *(u64 *)&l;
 	u64 b = *(u64 *)&r;
@@ -389,7 +389,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 }
 
 static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg pgid)
+						   struct ceph_pg_v1 pgid)
 {
 	struct rb_node *n = root->rb_node;
 	struct ceph_pg_mapping *pg;
@@ -411,7 +411,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 	return NULL;
 }
 
-static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
+static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg_v1 pgid)
 {
 	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
 
@@ -721,7 +721,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, len, bad);
 	for (i = 0; i < len; i++) {
 		int n, j;
-		struct ceph_pg pgid;
+		struct ceph_pg_v1 pgid;
 		struct ceph_pg_mapping *pg;
 
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
@@ -944,7 +944,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_mapping *pg;
 		int j;
-		struct ceph_pg pgid;
+		struct ceph_pg_v1 pgid;
 		u32 pglen;
 		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
 		ceph_decode_copy(p, &pgid, sizeof(pgid));
@@ -1079,7 +1079,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned int num, num_mask;
-	struct ceph_pg pgid;
+	struct ceph_pg_v1 pgid;
 	int poolid = le32_to_cpu(fl->fl_pg_pool);
 	struct ceph_pg_pool_info *pool;
 	unsigned int ps;
@@ -1108,7 +1108,7 @@ EXPORT_SYMBOL(ceph_calc_object_layout);
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
  * array, or NULL on failure.
  */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 			int *osds, int *num)
 {
 	struct ceph_pg_mapping *pg;
@@ -1163,7 +1163,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 /*
  * Return acting set for given pgid.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 			int *acting)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
@@ -1184,7 +1184,7 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
 	int i, num = CEPH_PG_MAX_SIZE;
-- 
cgit v1.2.3-58-ga151


From 5b191d9914eb68257f47de9d5bfe099b77f0687c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 23 Feb 2013 10:38:16 -0800
Subject: libceph: decode into cpu-native ceph_pg type

Always decode data into our cpu-native ceph_pg type that has the correct
field widths.  Limit any remaining uses of ceph_pg_v1 to dealing with the
legacy protocol.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c                 |  5 +--
 include/linux/ceph/osd_client.h |  2 +-
 include/linux/ceph/osdmap.h     | 11 ++++--
 net/ceph/debugfs.c              |  5 ++-
 net/ceph/osd_client.c           |  9 ++---
 net/ceph/osdmap.c               | 78 +++++++++++++++++++++++------------------
 6 files changed, 62 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e831436d6e68..fb036ed3e129 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -186,7 +186,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
-	struct ceph_pg_v1 pgid;
+	struct ceph_pg pgid;
 	int r;
 
 	/* copy and validate */
@@ -212,7 +212,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
 				osdc->osdmap);
 
-	pgid = ol.ol_pgid;
+	pgid.pool = le32_to_cpu(ol.ol_pgid.pool);
+	pgid.seed = le16_to_cpu(ol.ol_pgid.ps);
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index be2867330e23..388158ff0cbc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -56,7 +56,7 @@ struct ceph_osd_request {
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd;
 	struct ceph_osd *r_osd;
-	struct ceph_pg_v1 r_pgid;
+	struct ceph_pg   r_pgid;
 	int              r_pg_osds[CEPH_PG_MAX_SIZE];
 	int              r_num_pg_osds;
 
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index eb4989aa48e8..8a612df4c248 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -18,6 +18,11 @@
  * The map can be updated either via an incremental map (diff) describing
  * the change between two successive epochs, or as a fully encoded map.
  */
+struct ceph_pg {
+	uint64_t pool;
+	uint32_t seed;
+};
+
 struct ceph_pg_pool_info {
 	struct rb_node node;
 	int id;
@@ -28,7 +33,7 @@ struct ceph_pg_pool_info {
 
 struct ceph_pg_mapping {
 	struct rb_node node;
-	struct ceph_pg_v1 pgid;
+	struct ceph_pg pgid;
 	int len;
 	int osds[];
 };
@@ -119,10 +124,10 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
 extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
-			       struct ceph_pg_v1 pgid,
+			       struct ceph_pg pgid,
 			       int *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg_v1 pgid);
+				struct ceph_pg pgid);
 
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 38b5dc1823d4..61a9af634f8b 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -131,10 +131,9 @@ static int osdc_show(struct seq_file *s, void *pp)
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
 
-		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
 			   req->r_osd ? req->r_osd->o_osd : -1,
-			   le32_to_cpu(req->r_pgid.pool),
-			   le16_to_cpu(req->r_pgid.ps));
+			   req->r_pgid.pool, req->r_pgid.seed);
 
 		head = req->r_request->front.iov_base;
 		op = (void *)(head + 1);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e3ab8d60d080..1990834e518b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -914,7 +914,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_pg_v1 pgid;
+	struct ceph_pg pgid;
 	int acting[CEPH_PG_MAX_SIZE];
 	int o = -1, num = 0;
 	int err;
@@ -926,7 +926,8 @@ static int __map_request(struct ceph_osd_client *osdc,
 		list_move(&req->r_req_lru_item, &osdc->req_notarget);
 		return err;
 	}
-	pgid = reqhead->layout.ol_pgid;
+	pgid.pool = le32_to_cpu(reqhead->layout.ol_pgid.pool);
+	pgid.seed = le16_to_cpu(reqhead->layout.ol_pgid.ps);
 	req->r_pgid = pgid;
 
 	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
@@ -943,8 +944,8 @@ static int __map_request(struct ceph_osd_client *osdc,
 	    (req->r_osd == NULL && o == -1))
 		return 0;  /* no change */
 
-	dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n",
-	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
+	     req->r_tid, pgid.pool, pgid.seed, o,
 	     req->r_osd ? req->r_osd->o_osd : -1);
 
 	/* record full pg acting set */
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 8c89ac25081a..81118db5bd11 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -350,14 +350,15 @@ bad:
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds)
  */
-static int pgid_cmp(struct ceph_pg_v1 l, struct ceph_pg_v1 r)
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
 {
-	u64 a = *(u64 *)&l;
-	u64 b = *(u64 *)&r;
-
-	if (a < b)
+	if (l.pool < r.pool)
+		return -1;
+	if (l.pool > r.pool)
+		return 1;
+	if (l.seed < r.seed)
 		return -1;
-	if (a > b)
+	if (l.seed > r.seed)
 		return 1;
 	return 0;
 }
@@ -389,7 +390,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 }
 
 static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg_v1 pgid)
+						   struct ceph_pg pgid)
 {
 	struct rb_node *n = root->rb_node;
 	struct ceph_pg_mapping *pg;
@@ -403,25 +404,26 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 		} else if (c > 0) {
 			n = n->rb_right;
 		} else {
-			dout("__lookup_pg_mapping %llx got %p\n",
-			     *(u64 *)&pgid, pg);
+			dout("__lookup_pg_mapping %lld.%x got %p\n",
+			     pgid.pool, pgid.seed, pg);
 			return pg;
 		}
 	}
 	return NULL;
 }
 
-static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg_v1 pgid)
+static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
 {
 	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
 
 	if (pg) {
-		dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg);
+		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
+		     pg);
 		rb_erase(&pg->node, root);
 		kfree(pg);
 		return 0;
 	}
-	dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid);
+	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
 	return -ENOENT;
 }
 
@@ -721,11 +723,14 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, len, bad);
 	for (i = 0; i < len; i++) {
 		int n, j;
-		struct ceph_pg_v1 pgid;
+		struct ceph_pg pgid;
+		struct ceph_pg_v1 pgid_v1;
 		struct ceph_pg_mapping *pg;
 
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
+		pgid.pool = le32_to_cpu(pgid_v1.pool);
+		pgid.seed = le16_to_cpu(pgid_v1.ps);
 		n = ceph_decode_32(p);
 		err = -EINVAL;
 		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
@@ -743,7 +748,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		err = __insert_pg_mapping(pg, &map->pg_temp);
 		if (err)
 			goto bad;
-		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+		dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
+		     len);
 	}
 
 	/* crush */
@@ -944,10 +950,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_mapping *pg;
 		int j;
-		struct ceph_pg_v1 pgid;
+		struct ceph_pg_v1 pgid_v1;
+		struct ceph_pg pgid;
 		u32 pglen;
 		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
+		pgid.pool = le32_to_cpu(pgid_v1.pool);
+		pgid.seed = le16_to_cpu(pgid_v1.ps);
 		pglen = ceph_decode_32(p);
 
 		if (pglen) {
@@ -973,8 +982,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 				kfree(pg);
 				goto bad;
 			}
-			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
-			     pglen);
+			dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
+			     pgid.seed, pglen);
 		} else {
 			/* remove */
 			__remove_pg_mapping(&map->pg_temp, pgid);
@@ -1079,26 +1088,25 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned int num, num_mask;
-	struct ceph_pg_v1 pgid;
-	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg pgid;
 	struct ceph_pg_pool_info *pool;
-	unsigned int ps;
 
 	BUG_ON(!osdmap);
 
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	pgid.pool = le32_to_cpu(fl->fl_pg_pool);
+	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
 	if (!pool)
 		return -EIO;
-	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+	pgid.seed = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
 	num = le32_to_cpu(pool->v.pg_num);
 	num_mask = pool->pg_num_mask;
 
-	pgid.ps = cpu_to_le16(ps);
-	pgid.preferred = cpu_to_le16(-1);
-	pgid.pool = fl->fl_pg_pool;
-	dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
+	     pgid.seed);
 
-	ol->ol_pgid = pgid;
+	ol->ol_pgid.ps = cpu_to_le16(pgid.seed);
+	ol->ol_pgid.pool = fl->fl_pg_pool;
+	ol->ol_pgid.preferred = cpu_to_le16(-1);
 	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
 	return 0;
 }
@@ -1108,7 +1116,7 @@ EXPORT_SYMBOL(ceph_calc_object_layout);
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
  * array, or NULL on failure.
  */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			int *osds, int *num)
 {
 	struct ceph_pg_mapping *pg;
@@ -1116,8 +1124,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 	int ruleno;
 	unsigned int poolid, ps, pps, t, r;
 
-	poolid = le32_to_cpu(pgid.pool);
-	ps = le16_to_cpu(pgid.ps);
+	poolid = pgid.pool;
+	ps = pgid.seed;
 
 	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
 	if (!pool)
@@ -1126,7 +1134,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 	/* pg_temp? */
 	t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
 			    pool->pgp_num_mask);
-	pgid.ps = cpu_to_le16(t);
+	pgid.seed = t;
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
 		*num = pg->len;
@@ -1163,7 +1171,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 /*
  * Return acting set for given pgid.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			int *acting)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
@@ -1184,7 +1192,7 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid,
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg_v1 pgid)
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
 	int rawosds[CEPH_PG_MAX_SIZE], *osds;
 	int i, num = CEPH_PG_MAX_SIZE;
-- 
cgit v1.2.3-58-ga151


From 4f6a7e5ee1393ec4b243b39dac9f36992d161540 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sat, 23 Feb 2013 10:41:09 -0800
Subject: ceph: update support for PGID64, PGPOOL3, OSDENC protocol features

Support (and require) the PGID64, PGPOOL3, and OSDENC protocol features.
These have been present in ceph.git since v0.42, Feb 2012.  Require these
features to simplify support; nobody is running older userspace.

Note that the new request and reply encoding is still not in place, so the new
code is not yet functional.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/mdsmap.c                   |  12 ++-
 include/linux/ceph/ceph_features.h |  14 +++-
 include/linux/ceph/mdsmap.h        |   4 +-
 include/linux/ceph/osdmap.h        |  16 +++-
 include/linux/ceph/rados.h         |  23 ------
 net/ceph/ceph_common.c             |   6 +-
 net/ceph/debugfs.c                 |   6 +-
 net/ceph/osdmap.c                  | 162 ++++++++++++++++++++-----------------
 8 files changed, 124 insertions(+), 119 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a35..0d3c9240c61b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		return ERR_PTR(-ENOMEM);
 
 	ceph_decode_16_safe(p, end, version, bad);
+	if (version > 3) {
+		pr_warning("got mdsmap version %d > 3, failing", version);
+		goto bad;
+	}
 
 	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
 	m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 	/* pg_pools */
 	ceph_decode_32_safe(p, end, n, bad);
 	m->m_num_data_pg_pools = n;
-	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
 	if (!m->m_data_pg_pools)
 		goto badmem;
-	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
 	for (i = 0; i < n; i++)
-		m->m_data_pg_pools[i] = ceph_decode_32(p);
-	m->m_cas_pg_pool = ceph_decode_32(p);
+		m->m_data_pg_pools[i] = ceph_decode_64(p);
+	m->m_cas_pg_pool = ceph_decode_64(p);
 
 	/* ok, we don't care about the rest. */
 	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 9e0f5a8ba247..ab0a54286e0d 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -39,11 +39,17 @@
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
-	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES |	  \
-	 CEPH_FEATURE_CRUSH_TUNABLES2 |   \
+	(CEPH_FEATURE_NOSRCADDR |		\
+	 CEPH_FEATURE_PGID64 |			\
+	 CEPH_FEATURE_PGPOOL3 |			\
+	 CEPH_FEATURE_OSDENC |			\
+	 CEPH_FEATURE_CRUSH_TUNABLES |		\
+	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
 	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
-	(CEPH_FEATURE_NOSRCADDR)
+	(CEPH_FEATURE_NOSRCADDR |	 \
+	 CEPH_FEATURE_PGID64 |		 \
+	 CEPH_FEATURE_PGPOOL3 |		 \
+	 CEPH_FEATURE_OSDENC)
 #endif
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index cb15b5d867c7..87ed09f54800 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
 
 	/* which object pools file data can be stored in */
 	int m_num_data_pg_pools;
-	u32 *m_data_pg_pools;
-	u32 m_cas_pg_pool;
+	u64 *m_data_pg_pools;
+	u64 m_cas_pg_pool;
 };
 
 static inline struct ceph_entity_addr *
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8a612df4c248..8587746b7f0e 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -25,12 +25,22 @@ struct ceph_pg {
 
 struct ceph_pg_pool_info {
 	struct rb_node node;
-	int id;
-	struct ceph_pg_pool v;
-	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+	s64 id;
+	u8 type;
+	u8 size;
+	u8 crush_ruleset;
+	u8 object_hash;
+	u32 pg_num, pgp_num;
+	int pg_num_mask, pgp_num_mask;
+	u64 flags;
 	char *name;
 };
 
+struct ceph_object_locator {
+	uint64_t pool;
+	char *key;
+};
+
 struct ceph_pg_mapping {
 	struct rb_node node;
 	struct ceph_pg pgid;
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index e7cece69b13f..d784c8dfb09a 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -8,14 +8,6 @@
 
 #include <linux/ceph/msgr.h>
 
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 6
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     6
-
 /*
  * fs id
  */
@@ -91,21 +83,6 @@ struct ceph_pg_v1 {
 
 #define CEPH_PG_TYPE_REP     1
 #define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-	__u8 type;                /* CEPH_PG_TYPE_* */
-	__u8 size;                /* number of osds in each pg */
-	__u8 crush_ruleset;       /* crush placement rule */
-	__u8 object_hash;         /* hash mapping object name to ps */
-	__le32 pg_num, pgp_num;   /* number of pg's */
-	__le32 lpg_num, lpgp_num; /* number of localized pg's */
-	__le32 last_change;       /* most recent epoch changed */
-	__le64 snap_seq;          /* seq for per-pool snapshot */
-	__le32 snap_epoch;        /* epoch of last snap */
-	__le32 num_snaps;
-	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-	__le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
 
 /*
  * stable_mod func is used to control number of placement groups.
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index c236c235c4a2..c5605ae96714 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -601,10 +601,8 @@ static int __init init_ceph_lib(void)
 	if (ret < 0)
 		goto out_crypto;
 
-	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
-		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+	pr_info("loaded (mon/osd proto %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
 
 	return 0;
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 61a9af634f8b..f4d4b27d6026 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p)
 	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
 		struct ceph_pg_pool_info *pool =
 			rb_entry(n, struct ceph_pg_pool_info, node);
-		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   pool->id, pool->v.pg_num, pool->pg_num_mask,
-			   pool->v.lpg_num, pool->lpg_num_mask);
+		seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
+			   (unsigned long long)pool->id, pool->pg_num,
+			   pool->pg_num_mask);
 	}
 	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
 		struct ceph_entity_addr *addr =
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 81118db5bd11..911919320d2e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -45,13 +45,8 @@ static int calc_bits_of(unsigned int t)
  */
 static void calc_pg_masks(struct ceph_pg_pool_info *pi)
 {
-	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-	pi->pgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-	pi->lpg_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-	pi->lpgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
+	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
 }
 
 /*
@@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
 	return 0;
 }
 
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
 {
 	struct ceph_pg_pool_info *pi;
 	struct rb_node *n = root->rb_node;
@@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
 
 static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 {
-	unsigned int n, m;
+	u8 ev, cv;
+	unsigned len, num;
+	void *pool_end;
+
+	ceph_decode_need(p, end, 2 + 4, bad);
+	ev = ceph_decode_8(p);  /* encoding version */
+	cv = ceph_decode_8(p); /* compat version */
+	if (ev < 5) {
+		pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
+		return -EINVAL;
+	}
+	if (cv > 7) {
+		pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
+		return -EINVAL;
+	}
+	len = ceph_decode_32(p);
+	ceph_decode_need(p, end, len, bad);
+	pool_end = *p + len;
 
-	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-	calc_pg_masks(pi);
+	pi->type = ceph_decode_8(p);
+	pi->size = ceph_decode_8(p);
+	pi->crush_ruleset = ceph_decode_8(p);
+	pi->object_hash = ceph_decode_8(p);
 
-	/* num_snaps * snap_info_t */
-	n = le32_to_cpu(pi->v.num_snaps);
-	while (n--) {
-		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
-				 sizeof(struct ceph_timespec), bad);
-		*p += sizeof(u64) +       /* key */
-			1 + sizeof(u64) + /* u8, snapid */
-			sizeof(struct ceph_timespec);
-		m = ceph_decode_32(p);    /* snap name */
-		*p += m;
+	pi->pg_num = ceph_decode_32(p);
+	pi->pgp_num = ceph_decode_32(p);
+
+	*p += 4 + 4;  /* skip lpg* */
+	*p += 4;      /* skip last_change */
+	*p += 8 + 4;  /* skip snap_seq, snap_epoch */
+
+	/* skip snaps */
+	num = ceph_decode_32(p);
+	while (num--) {
+		*p += 8;  /* snapid key */
+		*p += 1 + 1; /* versions */
+		len = ceph_decode_32(p);
+		*p += len;
 	}
 
-	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+	/* skip removed snaps */
+	num = ceph_decode_32(p);
+	*p += num * (8 + 8);
+
+	*p += 8;  /* skip auid */
+	pi->flags = ceph_decode_64(p);
+
+	/* ignore the rest */
+
+	*p = pool_end;
+	calc_pg_masks(pi);
 	return 0;
 
 bad:
@@ -535,14 +563,15 @@ bad:
 static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 {
 	struct ceph_pg_pool_info *pi;
-	u32 num, len, pool;
+	u32 num, len;
+	u64 pool;
 
 	ceph_decode_32_safe(p, end, num, bad);
 	dout(" %d pool names\n", num);
 	while (num--) {
-		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_64_safe(p, end, pool, bad);
 		ceph_decode_32_safe(p, end, len, bad);
-		dout("  pool %d len %d\n", pool, len);
+		dout("  pool %llu len %d\n", pool, len);
 		ceph_decode_need(p, end, len, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (pi) {
@@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	struct ceph_osdmap *map;
 	u16 version;
 	u32 len, max, i;
-	u8 ev;
 	int err = -EINVAL;
 	void *start = *p;
 	struct ceph_pg_pool_info *pi;
@@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	map->pg_temp = RB_ROOT;
 
 	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_VERSION) {
-		pr_warning("got unknown v %d > %d of osdmap\n", version,
-			   CEPH_OSDMAP_VERSION);
+	if (version > 6) {
+		pr_warning("got unknown v %d > 6 of osdmap\n", version);
+		goto bad;
+	}
+	if (version < 6) {
+		pr_warning("got old v %d < 6 of osdmap\n", version);
 		goto bad;
 	}
 
@@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
-		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		ceph_decode_need(p, end, 8 + 2, bad);
 		err = -ENOMEM;
 		pi = kzalloc(sizeof(*pi), GFP_NOFS);
 		if (!pi)
 			goto bad;
-		pi->id = ceph_decode_32(p);
-		err = -EINVAL;
-		ev = ceph_decode_8(p); /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			kfree(pi);
-			goto bad;
-		}
+		pi->id = ceph_decode_64(p);
 		err = __decode_pool(p, end, pi);
 		if (err < 0) {
 			kfree(pi);
@@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		__insert_pg_pool(&map->pg_pools, pi);
 	}
 
-	if (version >= 5) {
-		err = __decode_pool_names(p, end, map);
-		if (err < 0) {
-			dout("fail to decode pool names");
-			goto bad;
-		}
+	err = __decode_pool_names(p, end, map);
+	if (err < 0) {
+		dout("fail to decode pool names");
+		goto bad;
 	}
 
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
@@ -788,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	struct ceph_fsid fsid;
 	u32 epoch = 0;
 	struct ceph_timespec modified;
-	u32 len, pool;
-	__s32 new_pool_max, new_flags, max;
+	s32 len;
+	u64 pool;
+	__s64 new_pool_max;
+	__s32 new_flags, max;
 	void *start = *p;
 	int err = -EINVAL;
 	u16 version;
 
 	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_INC_VERSION) {
-		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-			   CEPH_OSDMAP_INC_VERSION);
+	if (version > 6) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6);
 		goto bad;
 	}
 
@@ -807,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	epoch = ceph_decode_32(p);
 	BUG_ON(epoch != map->epoch+1);
 	ceph_decode_copy(p, &modified, sizeof(modified));
-	new_pool_max = ceph_decode_32(p);
+	new_pool_max = ceph_decode_64(p);
 	new_flags = ceph_decode_32(p);
 
 	/* full map? */
@@ -857,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	/* new_pool */
 	ceph_decode_32_safe(p, end, len, bad);
 	while (len--) {
-		__u8 ev;
 		struct ceph_pg_pool_info *pi;
 
-		ceph_decode_32_safe(p, end, pool, bad);
-		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-		ev = ceph_decode_8(p);  /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			err = -EINVAL;
-			goto bad;
-		}
+		ceph_decode_64_safe(p, end, pool, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (!pi) {
 			pi = kzalloc(sizeof(*pi), GFP_NOFS);
@@ -894,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_pool_info *pi;
 
-		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_64_safe(p, end, pool, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (pi)
 			__remove_pg_pool(&map->pg_pools, pi);
@@ -1097,8 +1110,8 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
 	if (!pool)
 		return -EIO;
-	pgid.seed = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-	num = le32_to_cpu(pool->v.pg_num);
+	pgid.seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
+	num = pool->pg_num;
 	num_mask = pool->pg_num_mask;
 
 	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
@@ -1132,8 +1145,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 		return NULL;
 
 	/* pg_temp? */
-	t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
-			    pool->pgp_num_mask);
+	t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask);
 	pgid.seed = t;
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
@@ -1142,26 +1154,24 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	}
 
 	/* crush */
-	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-				 pool->v.type, pool->v.size);
+	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+				 pool->type, pool->size);
 	if (ruleno < 0) {
 		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-		       poolid, pool->v.crush_ruleset, pool->v.type,
-		       pool->v.size);
+		       poolid, pool->crush_ruleset, pool->type,
+		       pool->size);
 		return NULL;
 	}
 
-	pps = ceph_stable_mod(ps,
-			      le32_to_cpu(pool->v.pgp_num),
-			      pool->pgp_num_mask);
+	pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask);
 	pps += poolid;
 	r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-			  min_t(int, pool->v.size, *num),
+			  min_t(int, pool->size, *num),
 			  osdmap->osd_weight);
 	if (r < 0) {
 		pr_err("error %d from crush rule: pool %d ruleset %d type %d"
-		       " size %d\n", r, poolid, pool->v.crush_ruleset,
-		       pool->v.type, pool->v.size);
+		       " size %d\n", r, poolid, pool->crush_ruleset,
+		       pool->type, pool->size);
 		return NULL;
 	}
 	*num = r;
-- 
cgit v1.2.3-58-ga151


From 2169aea649c08374bec7d220a3b8f64712275356 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 25 Feb 2013 16:13:08 -0800
Subject: libceph: calculate placement based on the internal data types

Instead of using the old ceph_object_layout struct, update our internal
ceph_calc_object_layout method to use the ceph_pg type.  This allows us to
pass the full 32-bit precision of the pgid.seed to the callers.  It also
allows some callers to avoid reaching into the request structures for the
struct ceph_object_layout fields.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c                 |  5 +----
 include/linux/ceph/osd_client.h |  1 +
 include/linux/ceph/osdmap.h     |  2 +-
 net/ceph/osd_client.c           | 11 +++++++----
 net/ceph/osdmap.c               | 18 +++++-------------
 5 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index fb036ed3e129..7d85991fd647 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
-	struct ceph_object_layout ol;
 	struct ceph_pg pgid;
 	int r;
 
@@ -209,11 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
 	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
 		 ceph_ino(inode), dl.object_no);
-	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+	ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
 				osdc->osdmap);
 
-	pgid.pool = le32_to_cpu(ol.ol_pgid.pool);
-	pgid.seed = le16_to_cpu(ol.ol_pgid.ps);
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 388158ff0cbc..ad8899fc3157 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -82,6 +82,7 @@ struct ceph_osd_request {
 
 	char              r_oid[MAX_OBJ_NAME_SIZE];          /* object name */
 	int               r_oid_len;
+	u64               r_snapid;
 	unsigned long     r_stamp;            /* send OR check time */
 
 	struct ceph_file_layout r_file_layout;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8587746b7f0e..35985125f118 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -129,7 +129,7 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+extern int ceph_calc_object_layout(struct ceph_pg *pg,
 				   const char *oid,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1990834e518b..5584f0a08e28 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -913,21 +913,18 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger);
 static int __map_request(struct ceph_osd_client *osdc,
 			 struct ceph_osd_request *req, int force_resend)
 {
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	struct ceph_pg pgid;
 	int acting[CEPH_PG_MAX_SIZE];
 	int o = -1, num = 0;
 	int err;
 
 	dout("map_request %p tid %lld\n", req, req->r_tid);
-	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+	err = ceph_calc_object_layout(&pgid, req->r_oid,
 				      &req->r_file_layout, osdc->osdmap);
 	if (err) {
 		list_move(&req->r_req_lru_item, &osdc->req_notarget);
 		return err;
 	}
-	pgid.pool = le32_to_cpu(reqhead->layout.ol_pgid.pool);
-	pgid.seed = le16_to_cpu(reqhead->layout.ol_pgid.ps);
 	req->r_pgid = pgid;
 
 	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
@@ -1000,10 +997,16 @@ static void __send_request(struct ceph_osd_client *osdc,
 	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
 
 	reqhead = req->r_request->front.iov_base;
+	reqhead->snapid = cpu_to_le64(req->r_snapid);
 	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
 	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
+	reqhead->layout.ol_pgid.ps = cpu_to_le16(req->r_pgid.seed);
+	reqhead->layout.ol_pgid.pool = cpu_to_le32(req->r_pgid.pool);
+	reqhead->layout.ol_pgid.preferred = cpu_to_le16(-1);
+	reqhead->layout.ol_stripe_unit = 0;
+
 	req->r_stamp = jiffies;
 	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 911919320d2e..378471644501 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1095,32 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
  * calculate an object layout (i.e. pgid) from an oid,
  * file_layout, and osdmap
  */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
+int ceph_calc_object_layout(struct ceph_pg *pg,
 			    const char *oid,
 			    struct ceph_file_layout *fl,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned int num, num_mask;
-	struct ceph_pg pgid;
 	struct ceph_pg_pool_info *pool;
 
 	BUG_ON(!osdmap);
-
-	pgid.pool = le32_to_cpu(fl->fl_pg_pool);
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+	pg->pool = le32_to_cpu(fl->fl_pg_pool);
+	pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool);
 	if (!pool)
 		return -EIO;
-	pgid.seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
+	pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
 	num = pool->pg_num;
 	num_mask = pool->pg_num_mask;
 
-	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
-	     pgid.seed);
-
-	ol->ol_pgid.ps = cpu_to_le16(pgid.seed);
-	ol->ol_pgid.pool = fl->fl_pg_pool;
-	ol->ol_pgid.preferred = cpu_to_le16(-1);
-	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed);
 	return 0;
 }
 EXPORT_SYMBOL(ceph_calc_object_layout);
-- 
cgit v1.2.3-58-ga151


From 1b83bef24c6746a146d39915a18fb5425f2facb0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 25 Feb 2013 16:11:12 -0800
Subject: libceph: update osd request/reply encoding

Use the new version of the encoding for osd requests and replies.  In the
process, update the way we are tracking request ops and reply lengths and
results in the struct ceph_osd_request.  Update the rbd and fs/ceph users
appropriately.

The main changes are:
 - we keep pointers into the request memory for fields we need to update
   each time the request is sent out over the wire
 - we keep information about the result in an array in the request struct
   where the users can easily get at it.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 drivers/block/rbd.c             |  52 +++++----
 fs/ceph/addr.c                  |  31 ++----
 include/linux/ceph/osd_client.h |  19 +++-
 include/linux/ceph/rados.h      |  38 -------
 net/ceph/debugfs.c              |  18 +---
 net/ceph/osd_client.c           | 233 +++++++++++++++++++++++++++++-----------
 6 files changed, 222 insertions(+), 169 deletions(-)

(limited to 'fs')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 22085e86a409..6c81a4c040b9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -196,7 +196,7 @@ struct rbd_obj_request {
 
 	u64			xferred;	/* bytes transferred */
 	u64			version;
-	s32			result;
+	int			result;
 	atomic_t		done;
 
 	rbd_obj_callback_t	callback;
@@ -1282,12 +1282,19 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
 
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
-
 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
 		obj_request->result, obj_request->xferred, obj_request->length);
-	if (obj_request->result == (s32) -ENOENT) {
+	/*
+	 * ENOENT means a hole in the object.  We zero-fill the
+	 * entire length of the request.  A short read also implies
+	 * zero-fill to the end of the request.  Either way we
+	 * update the xferred count to indicate the whole request
+	 * was satisfied.
+	 */
+	if (obj_request->result == -ENOENT) {
 		zero_bio_chain(obj_request->bio_list, 0);
 		obj_request->result = 0;
+		obj_request->xferred = obj_request->length;
 	} else if (obj_request->xferred < obj_request->length &&
 			!obj_request->result) {
 		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
@@ -1298,20 +1305,14 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 
 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
 {
-	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
-		obj_request->result, obj_request->xferred, obj_request->length);
-
-	/* A short write really shouldn't occur.  Warn if we see one */
-
-	if (obj_request->xferred != obj_request->length) {
-		struct rbd_img_request *img_request = obj_request->img_request;
-		struct rbd_device *rbd_dev;
-
-		rbd_dev = img_request ? img_request->rbd_dev : NULL;
-		rbd_warn(rbd_dev, "wrote %llu want %llu\n",
-			obj_request->xferred, obj_request->length);
-	}
-
+	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+		obj_request->result, obj_request->length);
+	/*
+	 * There is no such thing as a successful short write.
+	 * Our xferred value is the number of bytes transferred
+	 * back.  Set it to our originally-requested length.
+	 */
+	obj_request->xferred = obj_request->length;
 	obj_request_done_set(obj_request);
 }
 
@@ -1329,9 +1330,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 				struct ceph_msg *msg)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
-	struct ceph_osd_reply_head *reply_head;
-	struct ceph_osd_op *op;
-	u32 num_ops;
 	u16 opcode;
 
 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
@@ -1339,22 +1337,19 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	rbd_assert(!!obj_request->img_request ^
 				(obj_request->which == BAD_WHICH));
 
-	reply_head = msg->front.iov_base;
-	obj_request->result = (s32) le32_to_cpu(reply_head->result);
+	if (osd_req->r_result < 0)
+		obj_request->result = osd_req->r_result;
 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
 
-	num_ops = le32_to_cpu(reply_head->num_ops);
-	WARN_ON(num_ops != 1);	/* For now */
+	WARN_ON(osd_req->r_num_ops != 1);	/* For now */
 
 	/*
 	 * We support a 64-bit length, but ultimately it has to be
 	 * passed to blk_end_request(), which takes an unsigned int.
 	 */
-	op = &reply_head->ops[0];
-	obj_request->xferred = le64_to_cpu(op->extent.length);
+	obj_request->xferred = osd_req->r_reply_op_len[0];
 	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
-
-	opcode = le16_to_cpu(op->op);
+	opcode = osd_req->r_request_ops[0].op;
 	switch (opcode) {
 	case CEPH_OSD_OP_READ:
 		rbd_osd_read_callback(obj_request);
@@ -1719,6 +1714,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 		more = blk_end_request(img_request->rq, result, xferred);
 		which++;
 	}
+
 	rbd_assert(more ^ (which == img_request->obj_request_count));
 	img_request->next_completion = which;
 out:
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index fc613715af46..cfef3e01a9b3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
 static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	int rc, bytes;
+	int rc = req->r_result;
+	int bytes = le32_to_cpu(msg->hdr.data_len);
 	int i;
 
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le32_to_cpu(msg->hdr.data_len);
-
 	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
 
 	/* unlock all pages, zeroing any data we didn't read */
@@ -553,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
 			      struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	struct ceph_osd_op *op;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned wrote;
 	struct page *page;
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
-	__s32 rc = -EIO;
-	u64 bytes = 0;
+	int rc = req->r_result;
+	u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	op = (void *)(replyhead + 1);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le64_to_cpu(op->extent.length);
-
 	if (rc >= 0) {
 		/*
 		 * Assume we wrote the pages we originally sent.  The
@@ -740,8 +725,6 @@ retry:
 		struct page *page;
 		int want;
 		u64 offset, len;
-		struct ceph_osd_request_head *reqhead;
-		struct ceph_osd_op *op;
 		long writeback_stat;
 
 		next = 0;
@@ -905,10 +888,8 @@ get_more_pages:
 
 		/* revise final length, page count */
 		req->r_num_pages = locked_pages;
-		reqhead = req->r_request->front.iov_base;
-		op = (void *)(reqhead + 1);
-		op->extent.length = cpu_to_le64(len);
-		op->payload_len = cpu_to_le32(len);
+		req->r_request_ops[0].extent.length = cpu_to_le64(len);
+		req->r_request_ops[0].payload_len = cpu_to_le32(len);
 		req->r_request->hdr.data_len = cpu_to_le32(len);
 
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ad8899fc3157..1dd5d466b6f9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -47,6 +47,9 @@ struct ceph_osd {
 	struct list_head o_keepalive_item;
 };
 
+
+#define CEPH_OSD_MAX_OP 10
+
 /* an in-flight request */
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
@@ -63,9 +66,23 @@ struct ceph_osd_request {
 	struct ceph_connection *r_con_filling_msg;
 
 	struct ceph_msg  *r_request, *r_reply;
-	int               r_result;
 	int               r_flags;     /* any additional flags for the osd */
 	u32               r_sent;      /* >0 if r_request is sending/sent */
+	int               r_num_ops;
+
+	/* encoded message content */
+	struct ceph_osd_op *r_request_ops;
+	/* these are updated on each send */
+	__le32           *r_request_osdmap_epoch;
+	__le32           *r_request_flags;
+	__le64           *r_request_pool;
+	void             *r_request_pgid;
+	__le32           *r_request_attempts;
+	struct ceph_eversion *r_request_reassert_version;
+
+	int               r_result;
+	int               r_reply_op_len[CEPH_OSD_MAX_OP];
+	s32               r_reply_op_result[CEPH_OSD_MAX_OP];
 	int               r_got_reply;
 	int		  r_linger;
 
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index d784c8dfb09a..68c96a508ac2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -416,43 +416,5 @@ struct ceph_osd_op {
 	__le32 payload_len;
 } __attribute__ ((packed));
 
-/*
- * osd request message header.  each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
-	__le32 client_inc;                 /* client incarnation */
-	struct ceph_object_layout layout;  /* pgid */
-	__le32 osdmap_epoch;               /* client's osdmap epoch */
-
-	__le32 flags;
-
-	struct ceph_timespec mtime;        /* for mutations only */
-	struct ceph_eversion reassert_version; /* if we are replaying op */
-
-	__le32 object_len;     /* length of object name */
-
-	__le64 snapid;         /* snapid to read */
-	__le64 snap_seq;       /* writer's snap context */
-	__le32 num_snaps;
-
-	__le16 num_ops;
-	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
-	__le32 client_inc;                /* client incarnation */
-	__le32 flags;
-	struct ceph_object_layout layout;
-	__le32 osdmap_epoch;
-	struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
-	__le32 result;                    /* result code */
-
-	__le32 object_len;                /* length of object name */
-	__le32 num_ops;
-	struct ceph_osd_op ops[0];  /* ops[], object */
-} __attribute__ ((packed));
-
 
 #endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index f4d4b27d6026..00d051f4894e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -123,10 +123,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 	mutex_lock(&osdc->request_mutex);
 	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
 		struct ceph_osd_request *req;
-		struct ceph_osd_request_head *head;
-		struct ceph_osd_op *op;
-		int num_ops;
-		int opcode, olen;
+		int opcode;
 		int i;
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
@@ -135,13 +132,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 			   req->r_osd ? req->r_osd->o_osd : -1,
 			   req->r_pgid.pool, req->r_pgid.seed);
 
-		head = req->r_request->front.iov_base;
-		op = (void *)(head + 1);
-
-		num_ops = le16_to_cpu(head->num_ops);
-		olen = le32_to_cpu(head->object_len);
-		seq_printf(s, "%.*s", olen,
-			   (const char *)(head->ops + num_ops));
+		seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
 
 		if (req->r_reassert_version.epoch)
 			seq_printf(s, "\t%u'%llu",
@@ -150,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp)
 		else
 			seq_printf(s, "\t");
 
-		for (i = 0; i < num_ops; i++) {
-			opcode = le16_to_cpu(op->op);
+		for (i = 0; i < req->r_num_ops; i++) {
+			opcode = le16_to_cpu(req->r_request_ops[i].op);
 			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-			op++;
 		}
 
 		seq_printf(s, "\n");
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5584f0a08e28..d730dd4d8eb2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -146,15 +146,23 @@ EXPORT_SYMBOL(ceph_osdc_release_request);
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
-					       unsigned int num_op,
+					       unsigned int num_ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	size_t msg_size = sizeof(struct ceph_osd_request_head);
-
-	msg_size += num_op*sizeof(struct ceph_osd_op);
+	size_t msg_size;
+
+	msg_size = 4 + 4 + 8 + 8 + 4+8;
+	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
+	msg_size += 4 + MAX_OBJ_NAME_SIZE;
+	msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
+	msg_size += 8;  /* snapid */
+	msg_size += 8;  /* snap_seq */
+	msg_size += 8 * (snapc ? snapc->num_snaps : 0);  /* snaps */
+	msg_size += 4;
 
 	if (use_mempool) {
 		req = mempool_alloc(osdc->req_mempool, gfp_flags);
@@ -193,9 +201,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	ceph_pagelist_init(&req->r_trail);
 
 	/* create request message; allow space for oid */
-	msg_size += MAX_OBJ_NAME_SIZE;
-	if (snapc)
-		msg_size += sizeof(u64) * snapc->num_snaps;
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
@@ -324,55 +329,80 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 len, unsigned int num_op,
+			     u64 off, u64 len, unsigned int num_ops,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc, u64 snap_id,
 			     struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
-	struct ceph_osd_request_head *head;
 	struct ceph_osd_req_op *src_op;
-	struct ceph_osd_op *op;
 	void *p;
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	size_t msg_size;
 	int flags = req->r_flags;
 	u64 data_len;
 	int i;
 
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
-	head = msg->front.iov_base;
-	head->snapid = cpu_to_le64(snap_id);
-	op = (void *)(head + 1);
-	p = (void *)(op + num_op);
-
+	req->r_num_ops = num_ops;
+	req->r_snapid = snap_id;
 	req->r_snapc = ceph_get_snap_context(snapc);
 
-	head->client_inc = cpu_to_le32(1); /* always, for now. */
-	head->flags = cpu_to_le32(flags);
-	if (flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(&head->mtime, mtime);
-	BUG_ON(num_op > (unsigned int) ((u16) -1));
-	head->num_ops = cpu_to_le16(num_op);
+	/* encode request */
+	msg->hdr.version = cpu_to_le16(4);
 
-	/* fill in oid */
-	head->object_len = cpu_to_le32(req->r_oid_len);
+	p = msg->front.iov_base;
+	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
+	req->r_request_osdmap_epoch = p;
+	p += 4;
+	req->r_request_flags = p;
+	p += 4;
+	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(p, mtime);
+	p += sizeof(struct ceph_timespec);
+	req->r_request_reassert_version = p;
+	p += sizeof(struct ceph_eversion); /* will get filled in */
+
+	/* oloc */
+	ceph_encode_8(&p, 4);
+	ceph_encode_8(&p, 4);
+	ceph_encode_32(&p, 8 + 4 + 4);
+	req->r_request_pool = p;
+	p += 8;
+	ceph_encode_32(&p, -1);  /* preferred */
+	ceph_encode_32(&p, 0);   /* key len */
+
+	ceph_encode_8(&p, 1);
+	req->r_request_pgid = p;
+	p += 8 + 4;
+	ceph_encode_32(&p, -1);  /* preferred */
+
+	/* oid */
+	ceph_encode_32(&p, req->r_oid_len);
 	memcpy(p, req->r_oid, req->r_oid_len);
+	dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
 	p += req->r_oid_len;
 
+	/* ops */
+	ceph_encode_16(&p, num_ops);
 	src_op = src_ops;
-	while (num_op--)
-		osd_req_encode_op(req, op++, src_op++);
+	req->r_request_ops = p;
+	for (i = 0; i < num_ops; i++, src_op++) {
+		osd_req_encode_op(req, p, src_op);
+		p += sizeof(struct ceph_osd_op);
+	}
 
-	if (snapc) {
-		head->snap_seq = cpu_to_le64(snapc->seq);
-		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+	/* snaps */
+	ceph_encode_64(&p, req->r_snapid);
+	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
+	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
+	if (req->r_snapc) {
 		for (i = 0; i < snapc->num_snaps; i++) {
-			put_unaligned_le64(snapc->snaps[i], p);
-			p += sizeof(u64);
+			ceph_encode_64(&p, req->r_snapc->snaps[i]);
 		}
 	}
 
+	req->r_request_attempts = p;
+	p += 4;
+
 	data_len = req->r_trail.length;
 	if (flags & CEPH_OSD_FLAG_WRITE) {
 		req->r_request->hdr.data_off = cpu_to_le16(off);
@@ -385,6 +415,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	msg_size = p - msg->front.iov_base;
 	msg->front.iov_len = msg_size;
 	msg->hdr.front_len = cpu_to_le32(msg_size);
+
+	dout("build_request msg_size was %d num_ops %d\n", (int)msg_size,
+	     num_ops);
 	return;
 }
 EXPORT_SYMBOL(ceph_osdc_build_request);
@@ -991,21 +1024,22 @@ out:
 static void __send_request(struct ceph_osd_client *osdc,
 			   struct ceph_osd_request *req)
 {
-	struct ceph_osd_request_head *reqhead;
-
-	dout("send_request %p tid %llu to osd%d flags %d\n",
-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-
-	reqhead = req->r_request->front.iov_base;
-	reqhead->snapid = cpu_to_le64(req->r_snapid);
-	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
-	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
-	reqhead->reassert_version = req->r_reassert_version;
+	void *p;
 
-	reqhead->layout.ol_pgid.ps = cpu_to_le16(req->r_pgid.seed);
-	reqhead->layout.ol_pgid.pool = cpu_to_le32(req->r_pgid.pool);
-	reqhead->layout.ol_pgid.preferred = cpu_to_le16(-1);
-	reqhead->layout.ol_stripe_unit = 0;
+	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
+	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
+	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+
+	/* fill in message content that changes each time we send it */
+	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
+	put_unaligned_le32(req->r_flags, req->r_request_flags);
+	put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
+	p = req->r_request_pgid;
+	ceph_encode_64(&p, req->r_pgid.pool);
+	ceph_encode_32(&p, req->r_pgid.seed);
+	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
+	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
+	       sizeof(req->r_reassert_version));
 
 	req->r_stamp = jiffies;
 	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
@@ -1105,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req)
 	complete_all(&req->r_safe_completion);  /* fsync waiter */
 }
 
+static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
+{
+	__u8 v;
+
+	ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
+	v = ceph_decode_8(p);
+	if (v > 1) {
+		pr_warning("do not understand pg encoding %d > 1", v);
+		return -EINVAL;
+	}
+	pgid->pool = ceph_decode_64(p);
+	pgid->seed = ceph_decode_32(p);
+	*p += 4;
+	return 0;
+
+bad:
+	pr_warning("incomplete pg encoding");
+	return -EINVAL;
+}
+
 /*
  * handle osd op reply.  either call the callback if it is specified,
  * or do the completion to wake up the waiting thread.
@@ -1112,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req)
 static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 			 struct ceph_connection *con)
 {
-	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	void *p, *end;
 	struct ceph_osd_request *req;
 	u64 tid;
-	int numops, object_len, flags;
+	int object_len;
+	int numops, payload_len, flags;
 	s32 result;
+	s32 retry_attempt;
+	struct ceph_pg pg;
+	int err;
+	u32 reassert_epoch;
+	u64 reassert_version;
+	u32 osdmap_epoch;
+	int i;
 
 	tid = le64_to_cpu(msg->hdr.tid);
-	if (msg->front.iov_len < sizeof(*rhead))
-		goto bad;
-	numops = le32_to_cpu(rhead->num_ops);
-	object_len = le32_to_cpu(rhead->object_len);
-	result = le32_to_cpu(rhead->result);
-	if (msg->front.iov_len != sizeof(*rhead) + object_len +
-	    numops * sizeof(struct ceph_osd_op))
+	dout("handle_reply %p tid %llu\n", msg, tid);
+
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	ceph_decode_need(&p, end, 4, bad);
+	object_len = ceph_decode_32(&p);
+	ceph_decode_need(&p, end, object_len, bad);
+	p += object_len;
+
+	err = __decode_pgid(&p, end, &pg);
+	if (err)
 		goto bad;
-	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
+	flags = ceph_decode_64(&p);
+	result = ceph_decode_32(&p);
+	reassert_epoch = ceph_decode_32(&p);
+	reassert_version = ceph_decode_64(&p);
+	osdmap_epoch = ceph_decode_32(&p);
+
 	/* lookup */
 	mutex_lock(&osdc->request_mutex);
 	req = __lookup_request(osdc, tid);
@@ -1137,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 		return;
 	}
 	ceph_osdc_get_request(req);
-	flags = le32_to_cpu(rhead->flags);
+
+	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
+	     req, result);
+
+	ceph_decode_need(&p, end, 4, bad);
+	numops = ceph_decode_32(&p);
+	if (numops > CEPH_OSD_MAX_OP)
+		goto bad_put;
+	if (numops != req->r_num_ops)
+		goto bad_put;
+	payload_len = 0;
+	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad);
+	for (i = 0; i < numops; i++) {
+		struct ceph_osd_op *op = p;
+		int len;
+
+		len = le32_to_cpu(op->payload_len);
+		req->r_reply_op_len[i] = len;
+		dout(" op %d has %d bytes\n", i, len);
+		payload_len += len;
+		p += sizeof(*op);
+	}
+	if (payload_len != le32_to_cpu(msg->hdr.data_len)) {
+		pr_warning("sum of op payload lens %d != data_len %d",
+			   payload_len, le32_to_cpu(msg->hdr.data_len));
+		goto bad_put;
+	}
+
+	ceph_decode_need(&p, end, 4 + numops * 4, bad);
+	retry_attempt = ceph_decode_32(&p);
+	for (i = 0; i < numops; i++)
+		req->r_reply_op_result[i] = ceph_decode_32(&p);
 
 	/*
 	 * if this connection filled our message, drop our reference now, to
@@ -1152,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	if (!req->r_got_reply) {
 		unsigned int bytes;
 
-		req->r_result = le32_to_cpu(rhead->result);
+		req->r_result = result;
 		bytes = le32_to_cpu(msg->hdr.data_len);
 		dout("handle_reply result %d bytes %d\n", req->r_result,
 		     bytes);
@@ -1160,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 			req->r_result = bytes;
 
 		/* in case this is a write and we need to replay, */
-		req->r_reassert_version = rhead->reassert_version;
+		req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
+		req->r_reassert_version.version = cpu_to_le64(reassert_version);
 
 		req->r_got_reply = 1;
 	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
@@ -1195,10 +1301,11 @@ done:
 	ceph_osdc_put_request(req);
 	return;
 
+bad_put:
+	ceph_osdc_put_request(req);
 bad:
-	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-	       (int)sizeof(*rhead));
+	pr_err("corrupt osd_op_reply got %d %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
 	ceph_msg_dump(msg);
 }
 
-- 
cgit v1.2.3-58-ga151