From 7121064b214bbc97007ec4d0e70aaeffef1b55f7 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 3 Jul 2013 15:00:45 -0700
Subject: configfs: use capped length for ->store_attribute()

The difference between "count" and "len" is that "len" is capped at
4095.  Changing it like this makes it match how sysfs_write_file() is
implemented.

This is a static analysis patch.  I haven't found any store_attribute()
functions where this change makes a difference.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/configfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
 	mutex_lock(&buffer->mutex);
 	len = fill_write_buffer(buffer, buf, count);
 	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, buffer, count);
+		len = flush_write_buffer(file->f_path.dentry, buffer, len);
 	if (len > 0)
 		*ppos += len;
 	mutex_unlock(&buffer->mutex);
-- 
cgit v1.2.3-58-ga151


From 82d627cf1f41c6ca550bb404dfbc4bae2c954611 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Wed, 3 Jul 2013 15:00:46 -0700
Subject: fs/ocfs2/dlm/dlmrecovery.c: remove duplicate declarations

Below 3 functions have already been declared in dlmcommon.h, so we have
no need to declare them again in dlmrecovery.c:

  dlm_complete_recovery_thread
  dlm_launch_recovery_thread
  dlm_kick_recovery_thread

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Sunil Mushran <sunil.mushran@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmrecovery.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e68588e6b1e8..3e18641626b4 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
 
 static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 static int dlm_do_recovery(struct dlm_ctxt *dlm);
 
 static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
-- 
cgit v1.2.3-58-ga151


From 22ab9014bfdaf97449759aef946871aabe78bb40 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Wed, 3 Jul 2013 15:00:47 -0700
Subject: fs/ocfs2/dlm/dlmrecovery.c:dlm_request_all_locks(): ret should be int
 instead of enum

In dlm_request_all_locks, ret is type enum.  But o2net_send_message
returns a type int value.  Then it will never run into the following
error branch.  So we should change the ret type from enum to int.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Sunil Mushran <sunil.mushran@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmrecovery.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3e18641626b4..53e5c0284fa2 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -786,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 				 u8 dead_node)
 {
 	struct dlm_lock_request lr;
-	enum dlm_status ret;
+	int ret;
 
 	mlog(0, "\n");
 
@@ -799,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 	lr.dead_node = dead_node;
 
 	// send message
-	ret = DLM_NOLOCKMGR;
 	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
 				 &lr, sizeof(lr), request_from, NULL);
 
-- 
cgit v1.2.3-58-ga151


From 13eb98874c837b4ca0cb2db6a4fe3551e51b7632 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Wed, 3 Jul 2013 15:00:48 -0700
Subject: ocfs2: should not use le32_add_cpu to set ocfs2_dinode i_flags

If we use le32_add_cpu to set ocfs2_dinode i_flags, it may lead to the
corresponding flag corrupted.  So we should change it to bitwise and/or
operation.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: shencanquan <shencanquan@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/namei.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b4a5cdf9dbc5..75964addfe32 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
-	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
 	fe->i_atime = fe->i_ctime = fe->i_mtime =
 		cpu_to_le64(CURRENT_TIME.tv_sec);
 	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -2044,7 +2044,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+	fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
 	OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
 
 	/* Record which orphan dir our inode now resides
@@ -2434,7 +2434,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	}
 
 	di = (struct ocfs2_dinode *)di_bh->b_data;
-	le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+	di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
 	di->i_orphaned_slot = 0;
 	set_nlink(inode, 1);
 	ocfs2_set_links_count(di, inode->i_nlink);
-- 
cgit v1.2.3-58-ga151


From 40c7f2eaf59230135fd91a398924bdbf873378ec Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Wed, 3 Jul 2013 15:00:50 -0700
Subject: ocfs2: add missing dlm_put() in dlm_begin_reco_handler()

dlm_begin_reco_handler() returns without putting dlm when dlm recovery
state is DLM_RECO_STATE_FINALIZE.

Signed-off-by: joyce <xuejiufei@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmrecovery.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 53e5c0284fa2..773bd32bfd8c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2692,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 		     dlm->name, br->node_idx, br->dead_node,
 		     dlm->reco.dead_node, dlm->reco.new_master);
 		spin_unlock(&dlm->spinlock);
+		dlm_put(dlm);
 		return -EAGAIN;
 	}
 	spin_unlock(&dlm->spinlock);
-- 
cgit v1.2.3-58-ga151


From 8fa9d17f93ee8eb79d595c98cef0944f6ee5de75 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Wed, 3 Jul 2013 15:00:51 -0700
Subject: ocfs2: remove unecessary variable needs_checkpoint

Code cleanup: needs_checkpoint is assigned to but never used.  Delete
the variable.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Jeff Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/journal.h | 1 -
 fs/ocfs2/ocfs2.h   | 1 -
 fs/ocfs2/super.c   | 6 ++----
 3 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..0a992737dcaf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
-	atomic_set(&osb->needs_checkpoint, 1);
 	wake_up(&osb->checkpoint_event);
 }
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
 	struct task_struct *recovery_thread_task;
 	int disable_recovery;
 	wait_queue_head_t checkpoint_event;
-	atomic_t needs_checkpoint;
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 	spin_unlock(&osb->osb_lock);
 
 	out += snprintf(buf + out, len - out,
-			"%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+			"%10s => Pid: %d  Interval: %lu\n", "Commit",
 			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
-			osb->osb_commit_interval,
-			atomic_read(&osb->needs_checkpoint));
+			osb->osb_commit_interval);
 
 	out += snprintf(buf + out, len - out,
 			"%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	init_waitqueue_head(&osb->checkpoint_event);
-	atomic_set(&osb->needs_checkpoint, 0);
 
 	osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 
-- 
cgit v1.2.3-58-ga151


From 33add0e3a09a62c7796ea9838243c1cd933d8543 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Wed, 3 Jul 2013 15:00:52 -0700
Subject: ocfs2: fix mutex_unlock and possible memory leak in
 ocfs2_remove_btree_range

In ocfs2_remove_btree_range, when calling ocfs2_lock_refcount_tree and
ocfs2_prepare_refcount_change_for_del failed, it goes to out and then
tries to call mutex_unlock without mutex_lock before.  And when calling
ocfs2_reserve_blocks_for_rec_trunc failed, it should free ref_tree
before return.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 					       &ref_tree, NULL);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			goto bail;
 		}
 
 		ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 							    &extra_blocks);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out;
+			goto bail;
 		}
 	}
 
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 						 extra_blocks);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto bail;
 	}
 
 	mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
 	mutex_unlock(&tl_inode->i_mutex);
-
+bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-- 
cgit v1.2.3-58-ga151


From 40bd62eb7fb8447798732e809a676ebaf2a7f910 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Wed, 3 Jul 2013 15:00:53 -0700
Subject: fs/ocfs2/journal.h: add bits_wanted while calculating credits in
 ocfs2_calc_extend_credits

While adding extends to a file, the credits are calculated incorrectly
and if the requested clusters is more than one (or more because we used
a conservative limit) then we run out of journal credits and we hit an
assert in journalling code.

The function parameter bits_wanted variable was not used at all.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/journal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0a992737dcaf..96f9ac237e86 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -537,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
 	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
-	       ocfs2_quota_trans_credits(sb);
+	       ocfs2_quota_trans_credits(sb) + bits_wanted;
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
-- 
cgit v1.2.3-58-ga151


From d3e3b41b3dffff50c66651d60146b155d6ce0484 Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Wed, 3 Jul 2013 15:00:54 -0700
Subject: fs/ocfs2/cluster/tcp.c: free sc->sc_page in sc_kref_release()

There is a memory leak in sc_kref_release().  When free struct
o2net_sock_container (sc), we should release sc->sc_page.

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..bb9a48bfbd01 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
 	sc->sc_node = NULL;
 
 	o2net_debug_del_sc(sc);
+
+	if (sc->sc_page)
+		__free_page(sc->sc_page);
 	kfree(sc);
 }
 
-- 
cgit v1.2.3-58-ga151


From b30f14c490d0e3ff1b7c0952ccd343408557484e Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Wed, 3 Jul 2013 15:00:55 -0700
Subject: ocfs2: xattr: remove useless free space checking

Free space checking will be done in ocfs2_xattr_ibody_init().  So remove
here.

[akpm@linux-foundation.org: remove unused local]
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/xattr.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..0deabcdaf916 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,20 +2751,12 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 {
 	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct ocfs2_xa_loc loc;
 
 	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
 		return -ENOSPC;
 
 	down_write(&oi->ip_alloc_sem);
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
-		if (!ocfs2_xattr_has_space_inline(inode, di)) {
-			ret = -ENOSPC;
-			goto out;
-		}
-	}
-
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
 		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
 		if (ret) {
-- 
cgit v1.2.3-58-ga151


From 096b2ef83c87d7a7e0d2838f7f1391c74fdad0e6 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Wed, 3 Jul 2013 15:00:56 -0700
Subject: ocfs2: dlmlock_master() should return DLM_NORMAL after adding lock to
 blocked list

dlmlock_master() returns DLM_RECOVERING/DLM_MIGRATING/ DLM_FORWAR after
adding lock to blocked list if lockres has the state
DLM_LOCK_RES_RECOVERING/DLM_LOCK_RES_MIGRATING/ DLM_LOCK_RES_IN_PROGRESS.
so it will retry in dlmlock().  And this may cause dlm_thread fall into an
infinite loop

	Thread1                                  dlm_thread

  calls dlm_lock->dlmlock_master,
  if lockresA is in state
  DLM_LOCK_RES_RECOVERING, calls
  __dlm_wait_on_lockres() and waits
  until others threads clear this
  state;

  If cannot grant this lock,
  adding lock to blocked list,
  and return DLM_RECOVERING;

                                        Grant this lock and move it to
                                        grant list;

  After a while, retry and
  calls list_add_tail(), adding lock
  to blocked list again.

Granted and blocked list of this lockres will become the following
conditions:

    lock_res->granted.next = dlm_lock->list_head;
    lock_res->blocked.next = dlm_lock->list_head;
    dlm_lock->list_head.next = dlm_lock_resource->blocked;

When dlm_thread traverses the granted list, it will fall into an endless
loop, checking dlm_lock.list_head, dlm_lock->list_head.next
(i.e.lock_res->blocked), lock_res->blocked.next(i.e.dlm_lock.list_head
again) .....

Signed-off-by: joyce <xuejiufei@huawei.com>
Reviewed-by: jensen <shencanquan@huawei.com>
Cc: Jeff Liu <jeff.liu@oracle.com>
Acked-by: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmlock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
 				     lock->ml.node);
 			}
 		} else {
+			status = DLM_NORMAL;
 			dlm_lock_get(lock);
 			list_add_tail(&lock->list, &res->blocked);
 			kick_thread = 1;
-- 
cgit v1.2.3-58-ga151


From ea45466aec98b68faaf354901c42e281e58af50a Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Wed, 3 Jul 2013 15:00:58 -0700
Subject: ocfs2: need rollback when journal_access failed in ocfs2_orphan_add()

While adding a file into orphan dir in ocfs2_orphan_add(), it calls
__ocfs2_add_entry() before ocfs2_journal_access_di().  If
ocfs2_journal_access_di() failed, the file is added into orphan dir, and
orphan dir dinode updated, but file dinode has not been updated.
Accordingly, the data is not consistent between file dinode and orphan
dir.

So, need to call ocfs2_journal_access_di() before __ocfs2_add_entry(),
and if ocfs2_journal_access_di() failed, orphan_fe and
orphan_dir_inode->i_nlink need rollback.

This bug was added by 3939fda4 ("Ocfs2: Journaling i_flags and
i_orphaned_slot when adding inode to orphan dir.").

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Acked-by: Jeff Liu <jeff.liu@oracle.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/namei.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 75964addfe32..30842f5cf590 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2012,6 +2012,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 		goto leave;
 	}
 
+	/*
+	 * We're going to journal the change of i_flags and i_orphaned_slot.
+	 * It's safe anyway, though some callers may duplicate the journaling.
+	 * Journaling within the func just make the logic look more
+	 * straightforward.
+	 */
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(inode),
+					 fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	/* we're a cluster, and nlink can change on disk from
 	 * underneath us... */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,22 +2041,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 				   orphan_dir_bh, lookup);
 	if (status < 0) {
 		mlog_errno(status);
-		goto leave;
-	}
-
-	/*
-	 * We're going to journal the change of i_flags and i_orphaned_slot.
-	 * It's safe anyway, though some callers may duplicate the journaling.
-	 * Journaling within the func just make the logic look more
-	 * straightforward.
-	 */
-	status = ocfs2_journal_access_di(handle,
-					 INODE_CACHE(inode),
-					 fe_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
+		goto rollback;
 	}
 
 	fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
@@ -2057,11 +2057,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
 				   osb->slot_num);
 
+rollback:
+	if (status < 0) {
+		if (S_ISDIR(inode->i_mode))
+			ocfs2_add_links_count(orphan_fe, -1);
+		set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+	}
+
 leave:
 	brelse(orphan_dir_bh);
 
-	if (status)
-		mlog_errno(status);
 	return status;
 }
 
-- 
cgit v1.2.3-58-ga151


From 493098413bdb45f223ff0552e2f734849491dbbe Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Wed, 3 Jul 2013 15:00:59 -0700
Subject: ocfs2: rework transaction rollback in ocfs2_relink_block_group()

In ocfs2_relink_block_group(), we roll back all those changes if notify
intent to modify buffers for metadata update failed even if the relevant
buffer has not yet been modified/got dirty at that point, that are not
quite right because of:

 - None buffer has been modified/dirty if failed to call
   ocfs2_journal_access_gd() against the previous block group buffer

 - Only the previous block group buffer has got dirty if failed to call
   ocfs2_journal_access_gd() against the block group buffer

 - There is no need to roll back the change for file entry buffer at all

Those problems will not cause anything wrong but unnecessary.  This
patch fix them and kill the useless bg_ptr variable as well.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Cc: Younger Liu <younger.liu@huawei.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/suballoc.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..101d16d78162 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	int status;
 	/* there is a really tiny chance the journal calls could fail,
 	 * but we wouldn't want inconsistent blocks in *any* case. */
-	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	u64 bg_ptr, prev_bg_ptr;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,7 +1437,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		(unsigned long long)le64_to_cpu(bg->bg_blkno),
 		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
-	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
@@ -1446,7 +1445,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out_rollback;
+		goto out;
 	}
 
 	prev_bg->bg_next_group = bg->bg_next_group;
@@ -1456,7 +1455,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out_rollback;
+		goto out_rollback_prev_bg;
 	}
 
 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
@@ -1466,21 +1465,21 @@ static int ocfs2_relink_block_group(handle_t *handle,
 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out_rollback;
+		goto out_rollback_bg;
 	}
 
 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
 	ocfs2_journal_dirty(handle, fe_bh);
 
-out_rollback:
-	if (status < 0) {
-		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
-		bg->bg_next_group = cpu_to_le64(bg_ptr);
-		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
-	}
+out:
+	return status;
 
-	if (status)
-		mlog_errno(status);
+out_rollback_bg:
+	bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+
+	mlog_errno(status);
 	return status;
 }
 
-- 
cgit v1.2.3-58-ga151


From 25e2892101ba541dce8593c698d70ccc278bc1fd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 3 Jul 2013 15:01:00 -0700
Subject: ocfs2: remove duplicated mlog_errno() in ocfs2_relink_block_group

Cc: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Younger Liu <younger.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/suballoc.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 101d16d78162..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1443,44 +1443,38 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 prev_bg_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
+	if (status < 0)
 		goto out;
-	}
 
 	prev_bg->bg_next_group = bg->bg_next_group;
 	ocfs2_journal_dirty(handle, prev_bg_bh);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
+	if (status < 0)
 		goto out_rollback_prev_bg;
-	}
 
 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
 	ocfs2_journal_dirty(handle, bg_bh);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
+	if (status < 0)
 		goto out_rollback_bg;
-	}
 
 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
 	ocfs2_journal_dirty(handle, fe_bh);
 
 out:
+	if (status < 0)
+		mlog_errno(status);
 	return status;
 
 out_rollback_bg:
 	bg->bg_next_group = cpu_to_le64(bg_ptr);
 out_rollback_prev_bg:
 	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
-
-	mlog_errno(status);
-	return status;
+	goto out;
 }
 
 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
-- 
cgit v1.2.3-58-ga151


From b5a8bb717e29b549584cc27ac8e109a803c4f8c4 Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Wed, 3 Jul 2013 15:01:01 -0700
Subject: ocfs2: fix readonly issue in ocfs2_unlink()

While deleting a file with ocfs2_unlink(), there is a bug in this
function.  This bug will result in filesystem read-only.

After calling ocfs2_orphan_add(), the file which will be deleted is
added into orphan dir.  If ocfs2_delete_entry() fails, the file still
exists in the parent dir.  And this scenario introduces a conflict of
metadata.

If a file is added into orphan dir, when we put inode of the file with
iput(), the inode i_flags is setted (~OCFS2_VALID_FL) in
ocfs2_remove_inode(), and then write back to disk.

But as previously mentioned, the file still exists in the parent dir.
On other nodes, the file can be still accessed.  When first read the
file with ocfs2_read_blocks() from disk, It will check and avalidate
inode using ocfs2_validate_inode_block().  So File system will be
readonly because the inode is invalid.  In other words, the inode
i_flags has been set (~OCFS2_VALID_FL).

[akpm@linux-foundation.org: cleanups]
[jeff.liu@oracle.com: s/inode_is_unlinkable/ocfs2_inode_is_unlinkable/]
Signed-off-by: Younger Liu <younger.liu@huawei.com>
Signed-off-by: Jensen <shencanquan@huawei.com>
Cc: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/namei.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 30842f5cf590..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
 	return ret;
 }
 
-static inline int inode_is_unlinkable(struct inode *inode)
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode)) {
 		if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
 {
 	int status;
 	int child_locked = 0;
+	bool is_unlinkable = false;
 	struct inode *inode = dentry->d_inode;
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	if (inode_is_unlinkable(inode)) {
+	if (ocfs2_inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 						  OCFS2_I(inode)->ip_blkno,
 						  orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
+		is_unlinkable = true;
 	}
 
 	handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
-					  &orphan_insert, orphan_dir);
-		if (status < 0) {
-			mlog_errno(status);
-			goto leave;
-		}
-	}
-
 	/* delete the name from the parent dir */
 	status = ocfs2_delete_entry(handle, dir, &lookup);
 	if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
 		mlog_errno(status);
 		if (S_ISDIR(inode->i_mode))
 			inc_nlink(dir);
+		goto leave;
+	}
+
+	if (is_unlinkable) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+				orphan_name, &orphan_insert, orphan_dir);
+		if (status < 0)
+			mlog_errno(status);
 	}
 
 leave:
-- 
cgit v1.2.3-58-ga151


From ef962df057aaafd714f5c22ba3de1be459571fdf Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Wed, 3 Jul 2013 15:01:03 -0700
Subject: ocfs2: xattr: fix inlined xattr reflink

Inlined xattr shared free space of inode block with inlined data or data
extent record, so the size of the later two should be adjusted when
inlined xattr is enabled.  See ocfs2_xattr_ibody_init().  But this isn't
done well when reflink.  For inode with inlined data, its max inlined
data size is adjusted in ocfs2_duplicate_inline_data(), no problem.  But
for inode with data extent record, its record count isn't adjusted.  Fix
it, or data extent record and inlined xattr may overwrite each other,
then cause data corruption or xattr failure.

One panic caused by this bug in our test environment is the following:

  kernel BUG at fs/ocfs2/xattr.c:1435!
  invalid opcode: 0000 [#1] SMP
  Pid: 10871, comm: multi_reflink_t Not tainted 2.6.39-300.17.1.el5uek #1
  RIP: ocfs2_xa_offset_pointer+0x17/0x20 [ocfs2]
  RSP: e02b:ffff88007a587948  EFLAGS: 00010283
  RAX: 0000000000000000 RBX: 0000000000000010 RCX: 00000000000051e4
  RDX: ffff880057092060 RSI: 0000000000000f80 RDI: ffff88007a587a68
  RBP: ffff88007a587948 R08: 00000000000062f4 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000010
  R13: ffff88007a587a68 R14: 0000000000000001 R15: ffff88007a587c68
  FS:  00007fccff7f06e0(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
  CS:  e033 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 00000000015cf000 CR3: 000000007aa76000 CR4: 0000000000000660
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process multi_reflink_t
  Call Trace:
    ocfs2_xa_reuse_entry+0x60/0x280 [ocfs2]
    ocfs2_xa_prepare_entry+0x17e/0x2a0 [ocfs2]
    ocfs2_xa_set+0xcc/0x250 [ocfs2]
    ocfs2_xattr_ibody_set+0x98/0x230 [ocfs2]
    __ocfs2_xattr_set_handle+0x4f/0x700 [ocfs2]
    ocfs2_xattr_set+0x6c6/0x890 [ocfs2]
    ocfs2_xattr_user_set+0x46/0x50 [ocfs2]
    generic_setxattr+0x70/0x90
    __vfs_setxattr_noperm+0x80/0x1a0
    vfs_setxattr+0xa9/0xb0
    setxattr+0xc3/0x120
    sys_fsetxattr+0xa8/0xd0
    system_call_fastpath+0x16/0x1b

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/xattr.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 0deabcdaf916..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6491,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
 	}
 
 	new_oi = OCFS2_I(args->new_inode);
+	/*
+	 * Adjust extent record count to reserve space for extended attribute.
+	 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+	 */
+	if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+	    !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+		struct ocfs2_extent_list *el = &new_di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(inline_size /
+					sizeof(struct ocfs2_extent_rec)));
+	}
 	spin_lock(&new_oi->ip_lock);
 	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
 	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
-- 
cgit v1.2.3-58-ga151


From e873fdb5259ac62cf099621590645470c12a1d21 Mon Sep 17 00:00:00 2001
From: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:01:04 -0700
Subject: ocfs2: submit disk heartbeat bio using WRITE_SYNC

Under heavy I/O load, writing the disk heartbeat can be forced to wait for
minutes, and this causes the node to be fenced.

This patch tries to use WRITE_SYNC in submitting the heartbeat bio, so
that writing the heartbeat will have a priority over other requests.

Signed-off-by: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
Acked-by: Tao Ma <tm@tao.ma>
Acked-by: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Srinivas Eeeda <srinivas.eeda@oracle.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..f89b46b66235 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
 	}
 
 	atomic_inc(&write_wc->wc_num_reqs);
-	submit_bio(WRITE, bio);
+	submit_bio(WRITE_SYNC, bio);
 
 	status = 0;
 bail:
-- 
cgit v1.2.3-58-ga151


From 70f651edb75b68e3c039d137a56be84f53211558 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Wed, 3 Jul 2013 15:01:06 -0700
Subject: ocfs2: consolidate o2hb_global_hearbeat_mode_set() naming convention

s/o2hb_global_hearbeat_mode_set/o2hb_global_heartbeat_mode_set/ to make
the signature of those routines in a consistent manner with others for
heartbeating.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Gurudas Pai <gurudas.pai@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
Cc: Srinivas Eeeda <srinivas.eeda@oracle.com>
Cc: Tao Ma <tm@tao.ma>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f89b46b66235..956d79d1fdde 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
 	}
 }
 
-static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
 {
 	int ret = -1;
 
@@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 		if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
 			continue;
 
-		ret = o2hb_global_hearbeat_mode_set(i);
+		ret = o2hb_global_heartbeat_mode_set(i);
 		if (!ret)
 			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
 			       o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 	NULL,
 };
 
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
 	.show_attribute		= o2hb_heartbeat_group_show,
 	.store_attribute	= o2hb_heartbeat_group_store,
 };
@@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
 
 static struct config_item_type o2hb_heartbeat_group_type = {
 	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
-	.ct_item_ops	= &o2hb_hearbeat_group_item_ops,
+	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
 	.ct_attrs	= o2hb_heartbeat_group_attrs,
 	.ct_owner	= THIS_MODULE,
 };
-- 
cgit v1.2.3-58-ga151


From b4d8ed4f8e527751c7ece6cef73f6212808e6130 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Wed, 3 Jul 2013 15:01:07 -0700
Subject: ocfs2: fix a comments typo at o2quo_hb_still_up()

Fix a comment typo in o2quo_hb_still_up()

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Cc: Gurudas Pai <gurudas.pai@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
Cc: Srinivas Eeeda <srinivas.eeda@oracle.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Tao Ma <tm@tao.ma>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/quorum.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
 /* This is analogous to hb_up.  as a node's connection comes up we delay the
  * quorum decision until we see it heartbeating.  the hold will be droped in
  * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
  * */
 void o2quo_conn_up(u8 node)
 {
-- 
cgit v1.2.3-58-ga151


From 44e89cb8e279abdf83581a10e592c2451bae7824 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Wed, 3 Jul 2013 15:01:08 -0700
Subject: ocfs2: adjust switch_case syntax at o2net_state_change()

Adjust switch..case syntax at o2net_state_change to meet the kernel coding
standard.

s/printk/pr_info/.

[akpm@linux-foundation.org: revert pr_foo() change]
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Gurudas Pai <gurudas.pai@oracle.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
Cc: Srinivas Eeeda <srinivas.eeda@oracle.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Tao Ma <tm@tao.ma>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/tcp.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index bb9a48bfbd01..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -633,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
 	state_change = sc->sc_state_change;
 
 	switch(sk->sk_state) {
-		/* ignore connecting sockets as they make progress */
-		case TCP_SYN_SENT:
-		case TCP_SYN_RECV:
-			break;
-		case TCP_ESTABLISHED:
-			o2net_sc_queue_work(sc, &sc->sc_connect_work);
-			break;
-		default:
-			printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
-			      " shutdown, state %d\n",
-			      SC_NODEF_ARGS(sc), sk->sk_state);
-			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
-			break;
+	/* ignore connecting sockets as they make progress */
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+		break;
+	case TCP_ESTABLISHED:
+		o2net_sc_queue_work(sc, &sc->sc_connect_work);
+		break;
+	default:
+		printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+			" shutdown, state %d\n",
+			SC_NODEF_ARGS(sc), sk->sk_state);
+		o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+		break;
 	}
 out:
 	read_unlock(&sk->sk_callback_lock);
-- 
cgit v1.2.3-58-ga151


From 4a184b4ff424e544359f081087723fc36efe603e Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Wed, 3 Jul 2013 15:01:10 -0700
Subject: ocfs2: fix NULL pointer dereference when traversing o2hb_all_regions

There may exist NULL pointer dereference in config_item_name() when one
volume (say Volume A) unmounts while another (say Volume B) mounting.

     Volume A                          Volume B

  already Mounted.
  Unmounting, call
  o2hb_heartbeat_group_drop_item()
    -> config_item_put(item)
    set reg(A)->item.ci_name to NULL
    in function config_item_cleanup().

                                    begin mounting, call
                                    o2hb_region_pin() and tranverse all
                                    regions. When reading
                                    reg(A)->item.ci_name, it causes
                                    NULL pointer dereference.

  call o2hb_region_release() and
  del reg(A) from list.

So we should skip accessing regions that is going to release when
tranverse o2hb_all_regions.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
Signed-off-by: joyce <xuejiufei@huawei.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 956d79d1fdde..5c1c864e81cc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
 	assert_spin_locked(&o2hb_live_lock);
 
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		uuid = config_item_name(&reg->hr_item);
 
 		/* local heartbeat */
@@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
 	assert_spin_locked(&o2hb_live_lock);
 
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		uuid = config_item_name(&reg->hr_item);
 		if (region_uuid) {
 			if (strcmp(region_uuid, uuid))
@@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
 
 	p = region_uuids;
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
 		if (numregs < max_regions) {
 			memcpy(p, config_item_name(&reg->hr_item),
-- 
cgit v1.2.3-58-ga151


From 040fa02077de01c7e08fa75be6125e4ca5636011 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 3 Jul 2013 15:01:16 -0700
Subject: clear_refs: sanitize accepted commands declaration

This is the implementation of the soft-dirty bit concept that should
help keep track of changes in user memory, which in turn is very-very
required by the checkpoint-restore project (http://criu.org).

To create a dump of an application(s) we save all the information about
it to files, and the biggest part of such dump is the contents of tasks'
memory.  However, there are usage scenarios where it's not required to
get _all_ the task memory while creating a dump.  For example, when
doing periodical dumps, it's only required to take full memory dump only
at the first step and then take incremental changes of memory.  Another
example is live migration.  We copy all the memory to the destination
node without stopping all tasks, then stop them, check for what pages
has changed, dump it and the rest of the state, then copy it to the
destination node.  This decreases freeze time significantly.

That said, some help from kernel to watch how processes modify the
contents of their memory is required.

The proposal is to track changes with the help of new soft-dirty bit
this way:

1. First do "echo 4 > /proc/$pid/clear_refs".
   At that point kernel clears the soft dirty _and_ the writable bits from all
   ptes of process $pid. From now on every write to any page will result in #pf
   and the subsequent call to pte_mkdirty/pmd_mkdirty, which in turn will set
   the soft dirty flag.

2. Then read the /proc/$pid/pagemap2 and check the soft-dirty bit reported there
   (the 55'th one). If set, the respective pte was written to since last call
   to clear refs.

The soft-dirty bit is the _PAGE_BIT_HIDDEN one.  Although it's used by
kmemcheck, the latter one marks kernel pages with it, while the former
bit is put on user pages so they do not conflict to each other.

This patch:

A new clear-refs type will be added in the next patch, so prepare
code for that.

[akpm@linux-foundation.org: don't assume that sizeof(enum clear_refs_types) == sizeof(int)]
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dad0809db551 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -688,6 +688,13 @@ const struct file_operations proc_tid_smaps_operations = {
 	.release	= seq_release_private,
 };
 
+enum clear_refs_types {
+	CLEAR_REFS_ALL = 1,
+	CLEAR_REFS_ANON,
+	CLEAR_REFS_MAPPED,
+	CLEAR_REFS_LAST,
+};
+
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
@@ -719,10 +726,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -730,7 +733,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	char buffer[PROC_NUMBUF];
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	int type;
+	enum clear_refs_types type;
+	int itype;
 	int rv;
 
 	memset(buffer, 0, sizeof(buffer));
@@ -738,10 +742,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		count = sizeof(buffer) - 1;
 	if (copy_from_user(buffer, buf, count))
 		return -EFAULT;
-	rv = kstrtoint(strstrip(buffer), 10, &type);
+	rv = kstrtoint(strstrip(buffer), 10, &itype);
 	if (rv < 0)
 		return rv;
-	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+	type = (enum clear_refs_types)itype;
+	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
 		return -EINVAL;
 	task = get_proc_task(file_inode(file));
 	if (!task)
-- 
cgit v1.2.3-58-ga151


From af9de7eb180fa9b74c2cdc256349304a58c63c02 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 3 Jul 2013 15:01:18 -0700
Subject: clear_refs: introduce private struct for mm_walk

In the next patch the clear-refs-type will be required in
clear_refs_pte_range funciton, so prepare the walk->private to carry
this info.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dad0809db551..ef6f6c62dfee 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -695,10 +695,15 @@ enum clear_refs_types {
 	CLEAR_REFS_LAST,
 };
 
+struct clear_refs_private {
+	struct vm_area_struct *vma;
+};
+
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
-	struct vm_area_struct *vma = walk->private;
+	struct clear_refs_private *cp = walk->private;
+	struct vm_area_struct *vma = cp->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -753,13 +758,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		struct clear_refs_private cp = {
+		};
 		struct mm_walk clear_refs_walk = {
 			.pmd_entry = clear_refs_pte_range,
 			.mm = mm,
+			.private = &cp,
 		};
 		down_read(&mm->mmap_sem);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
-			clear_refs_walk.private = vma;
+			cp.vma = vma;
 			if (is_vm_hugetlb_page(vma))
 				continue;
 			/*
-- 
cgit v1.2.3-58-ga151


From 2b0a9f017548f05e42fbf7e67c4a626c1ebd5e12 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 3 Jul 2013 15:01:19 -0700
Subject: pagemap: introduce pagemap_entry_t without pmshift bits

These bits are always constant (== PAGE_SHIFT) and just occupy space in
the entry.  Moreover, in next patch we will need to report one more bit
in the pagemap, but all bits are already busy on it.

That said, describe the pagemap entry that has 6 more free zero bits.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ef6f6c62dfee..39d641292579 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -807,6 +807,7 @@ typedef struct {
 struct pagemapread {
 	int pos, len;
 	pagemap_entry_t *buffer;
+	bool v2;
 };
 
 #define PAGEMAP_WALK_SIZE	(PMD_SIZE)
@@ -820,14 +821,16 @@ struct pagemapread {
 #define PM_PSHIFT_BITS      6
 #define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
 #define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
 #define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
 #define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+/* in "new" pagemap pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
 
 #define PM_PRESENT          PM_STATUS(4LL)
 #define PM_SWAP             PM_STATUS(2LL)
 #define PM_FILE             PM_STATUS(1LL)
-#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
 #define PM_END_OF_BUFFER    1
 
 static inline pagemap_entry_t make_pme(u64 val)
@@ -850,7 +853,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	struct pagemapread *pm = walk->private;
 	unsigned long addr;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		err = add_to_pagemap(addr, &pme, pm);
@@ -860,7 +863,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	return err;
 }
 
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
 	u64 frame, flags;
@@ -879,18 +882,18 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
 		if (is_migration_entry(entry))
 			page = migration_entry_to_page(entry);
 	} else {
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 		return;
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
 
-	*pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 					pmd_t pmd, int offset)
 {
 	/*
@@ -900,12 +903,12 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
 	 */
 	if (pmd_present(pmd))
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 #else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 						pmd_t pmd, int offset)
 {
 }
@@ -918,7 +921,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	struct pagemapread *pm = walk->private;
 	pte_t *pte;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
@@ -928,7 +931,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 			offset = (addr & ~PAGEMAP_WALK_MASK) >>
 					PAGE_SHIFT;
-			thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset);
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
 				break;
@@ -945,7 +948,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		 * and need a new, higher one */
 		if (vma && (addr >= vma->vm_end)) {
 			vma = find_vma(walk->mm, addr);
-			pme = make_pme(PM_NOT_PRESENT);
+			pme = make_pme(PM_NOT_PRESENT(pm->v2));
 		}
 
 		/* check that 'vma' actually covers this address,
@@ -953,7 +956,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		if (vma && (vma->vm_start <= addr) &&
 		    !is_vm_hugetlb_page(vma)) {
 			pte = pte_offset_map(pmd, addr);
-			pte_to_pagemap_entry(&pme, vma, addr, *pte);
+			pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
 			/* unmap before userspace copy */
 			pte_unmap(pte);
 		}
@@ -968,14 +971,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 					pte_t pte, int offset)
 {
 	if (pte_present(pte))
 		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -989,7 +992,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	for (; addr != end; addr += PAGE_SIZE) {
 		int offset = (addr & ~hmask) >> PAGE_SHIFT;
-		huge_pte_to_pagemap_entry(&pme, *pte, offset);
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
@@ -1051,6 +1054,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!count)
 		goto out_task;
 
+	pm.v2 = false;
 	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
 	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-- 
cgit v1.2.3-58-ga151


From 0f8975ec4db2c8b5bd111b211292ca9be0feb6b8 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 3 Jul 2013 15:01:20 -0700
Subject: mm: soft-dirty bits for user memory changes tracking

The soft-dirty is a bit on a PTE which helps to track which pages a task
writes to.  In order to do this tracking one should

  1. Clear soft-dirty bits from PTEs ("echo 4 > /proc/PID/clear_refs)
  2. Wait some time.
  3. Read soft-dirty bits (55'th in /proc/PID/pagemap2 entries)

To do this tracking, the writable bit is cleared from PTEs when the
soft-dirty bit is.  Thus, after this, when the task tries to modify a
page at some virtual address the #PF occurs and the kernel sets the
soft-dirty bit on the respective PTE.

Note, that although all the task's address space is marked as r/o after
the soft-dirty bits clear, the #PF-s that occur after that are processed
fast.  This is so, since the pages are still mapped to physical memory,
and thus all the kernel does is finds this fact out and puts back
writable, dirty and soft-dirty bits on the PTE.

Another thing to note, is that when mremap moves PTEs they are marked
with soft-dirty as well, since from the user perspective mremap modifies
the virtual memory at mremap's new address.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt   |  7 +++++-
 Documentation/vm/soft-dirty.txt      | 36 +++++++++++++++++++++++++++
 arch/Kconfig                         |  3 +++
 arch/x86/Kconfig                     |  1 +
 arch/x86/include/asm/pgtable.h       | 24 ++++++++++++++++--
 arch/x86/include/asm/pgtable_types.h | 12 +++++++++
 fs/proc/task_mmu.c                   | 47 ++++++++++++++++++++++++++++++++----
 include/asm-generic/pgtable.h        | 22 +++++++++++++++++
 mm/Kconfig                           | 12 +++++++++
 mm/huge_memory.c                     |  2 +-
 mm/mremap.c                          |  2 +-
 11 files changed, 158 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/vm/soft-dirty.txt

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d594fc7..fcc22c982a25 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -473,7 +473,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
-bits on both physical and virtual pages associated with a process.
+bits on both physical and virtual pages associated with a process, and the
+soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
 To clear the bits for all the pages associated with the process
     > echo 1 > /proc/PID/clear_refs
 
@@ -482,6 +483,10 @@ To clear the bits for the anonymous pages associated with the process
 
 To clear the bits for the file mapped pages associated with the process
     > echo 3 > /proc/PID/clear_refs
+
+To clear the soft-dirty bit
+    > echo 4 > /proc/PID/clear_refs
+
 Any other value written to /proc/PID/clear_refs will have no effect.
 
 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt
new file mode 100644
index 000000000000..9a12a5956bc0
--- /dev/null
+++ b/Documentation/vm/soft-dirty.txt
@@ -0,0 +1,36 @@
+                            SOFT-DIRTY PTEs
+
+  The soft-dirty is a bit on a PTE which helps to track which pages a task
+writes to. In order to do this tracking one should
+
+  1. Clear soft-dirty bits from the task's PTEs.
+
+     This is done by writing "4" into the /proc/PID/clear_refs file of the
+     task in question.
+
+  2. Wait some time.
+
+  3. Read soft-dirty bits from the PTEs.
+
+     This is done by reading from the /proc/PID/pagemap. The bit 55 of the
+     64-bit qword is the soft-dirty one. If set, the respective PTE was
+     written to since step 1.
+
+
+  Internally, to do this tracking, the writable bit is cleared from PTEs
+when the soft-dirty bit is cleared. So, after this, when the task tries to
+modify a page at some virtual address the #PF occurs and the kernel sets
+the soft-dirty bit on the respective PTE.
+
+  Note, that although all the task's address space is marked as r/o after the
+soft-dirty bits clear, the #PF-s that occur after that are processed fast.
+This is so, since the pages are still mapped to physical memory, and thus all
+the kernel does is finds this fact out and puts both writable and soft-dirty
+bits on the PTE.
+
+
+  This feature is actively used by the checkpoint-restore project. You
+can find more details about it on http://criu.org
+
+
+-- Pavel Emelyanov, Apr 9, 2013
diff --git a/arch/Kconfig b/arch/Kconfig
index a4429bcd609e..8d2ae24b9f4a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -365,6 +365,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	bool
 
+config HAVE_ARCH_SOFT_DIRTY
+	bool
+
 config HAVE_MOD_ARCH_SPECIFIC
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b094816a7e0f..10764a3d62cc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
 	select HAVE_ARCH_SECCOMP_FILTER
 	select BUILDTIME_EXTABLE_SORT
 	select GENERIC_CMOS_UPDATE
+	select HAVE_ARCH_SOFT_DIRTY
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
 	select ARCH_CLOCKSOURCE_DATA if X86_64
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5b0818bc8963..7dc305a46058 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return pte_set_flags(pte, _PAGE_DIRTY);
+	return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
 static inline pte_t pte_mkyoung(pte_t pte)
@@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
 
 static inline pmd_t pmd_mkdirty(pmd_t pmd)
 {
-	return pmd_set_flags(pmd, _PAGE_DIRTY);
+	return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
@@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_PRESENT);
 }
 
+static inline int pte_soft_dirty(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
 /*
  * Mask out unsupported bits in a present pgprot.  Non-present pgprots
  * can use those bits for other purposes, so leave them be.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index e6423002c10b..c98ac63aae48 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -55,6 +55,18 @@
 #define _PAGE_HIDDEN	(_AT(pteval_t, 0))
 #endif
 
+/*
+ * The same hidden bit is used by kmemcheck, but since kmemcheck
+ * works on kernel pages while soft-dirty engine on user space,
+ * they do not conflict with each other.
+ */
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
+#endif
+
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #else
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 39d641292579..a18e065c1c3e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -692,13 +693,32 @@ enum clear_refs_types {
 	CLEAR_REFS_ALL = 1,
 	CLEAR_REFS_ANON,
 	CLEAR_REFS_MAPPED,
+	CLEAR_REFS_SOFT_DIRTY,
 	CLEAR_REFS_LAST,
 };
 
 struct clear_refs_private {
 	struct vm_area_struct *vma;
+	enum clear_refs_types type;
 };
 
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	/*
+	 * The soft-dirty tracker uses #PF-s to catch writes
+	 * to pages, so write-protect the pte as well. See the
+	 * Documentation/vm/soft-dirty.txt for full description
+	 * of how soft-dirty works.
+	 */
+	pte_t ptent = *pte;
+	ptent = pte_wrprotect(ptent);
+	ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+	set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
@@ -718,6 +738,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(ptent))
 			continue;
 
+		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+			clear_soft_dirty(vma, addr, pte);
+			continue;
+		}
+
 		page = vm_normal_page(vma, addr, ptent);
 		if (!page)
 			continue;
@@ -759,6 +784,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	mm = get_task_mm(task);
 	if (mm) {
 		struct clear_refs_private cp = {
+			.type = type,
 		};
 		struct mm_walk clear_refs_walk = {
 			.pmd_entry = clear_refs_pte_range,
@@ -766,6 +792,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			.private = &cp,
 		};
 		down_read(&mm->mmap_sem);
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_start(mm, 0, -1);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
 			cp.vma = vma;
 			if (is_vm_hugetlb_page(vma))
@@ -786,6 +814,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			walk_page_range(vma->vm_start, vma->vm_end,
 					&clear_refs_walk);
 		}
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_end(mm, 0, -1);
 		flush_tlb_mm(mm);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -827,6 +857,7 @@ struct pagemapread {
 /* in "new" pagemap pshift bits are occupied with more status bits */
 #define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
 
+#define __PM_SOFT_DIRTY      (1LL)
 #define PM_PRESENT          PM_STATUS(4LL)
 #define PM_SWAP             PM_STATUS(2LL)
 #define PM_FILE             PM_STATUS(1LL)
@@ -868,6 +899,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 {
 	u64 frame, flags;
 	struct page *page = NULL;
+	int flags2 = 0;
 
 	if (pte_present(pte)) {
 		frame = pte_pfn(pte);
@@ -888,13 +920,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
+	if (pte_soft_dirty(pte))
+		flags2 |= __PM_SOFT_DIRTY;
 
-	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags);
+	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-					pmd_t pmd, int offset)
+		pmd_t pmd, int offset, int pmd_flags2)
 {
 	/*
 	 * Currently pmd for thp is always present because thp can not be
@@ -903,13 +937,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p
 	 */
 	if (pmd_present(pmd))
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
-				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
 	else
 		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 #else
 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-						pmd_t pmd, int offset)
+		pmd_t pmd, int offset, int pmd_flags2)
 {
 }
 #endif
@@ -926,12 +960,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
 	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+		int pmd_flags2;
+
+		pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
 
 			offset = (addr & ~PAGEMAP_WALK_MASK) >>
 					PAGE_SHIFT;
-			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset);
+			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
 				break;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b1836987d506..a7126d28f4cf 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -396,6 +396,28 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #define arch_start_context_switch(prev)	do {} while (0)
 #endif
 
+#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline int pte_soft_dirty(pte_t pte)
+{
+	return 0;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+	return pte;
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+	return pmd;
+}
+#endif
+
 #ifndef __HAVE_PFNMAP_TRACKING
 /*
  * Interfaces that can be used by architecture code to keep track of
diff --git a/mm/Kconfig b/mm/Kconfig
index f5e698e30d4a..7e28ecfa8aa4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,15 @@ config FRONTSWAP
 	  and swap data is stored as normal on the matching swap device.
 
 	  If unsure, say Y to enable frontswap.
+
+config MEM_SOFT_DIRTY
+	bool "Track memory changes"
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	select PROC_PAGE_MONITOR
+	help
+	  This option enables memory changes tracking by introducing a
+	  soft-dirty bit on pte-s. This bit it set when someone writes
+	  into a page just as regular dirty bit, but unlike the latter
+	  it can be cleared by hands.
+
+	  See Documentation/vm/soft-dirty.txt for more details.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..d8b3b850150c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 	if (ret == 1) {
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd);
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 		spin_unlock(&mm->page_table_lock);
 	}
 out:
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..3708655378e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			continue;
 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
-		set_pte_at(mm, new_addr, new_pte, pte);
+		set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
 	}
 
 	arch_leave_lazy_mmu_mode();
-- 
cgit v1.2.3-58-ga151


From 541c237c0923f567c9c4cabb8a81635baadc713f Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 3 Jul 2013 15:01:22 -0700
Subject: pagemap: prepare to reuse constant bits with page-shift

In order to reuse bits from pagemap entries gracefully, we leave the
entries as is but on pagemap open emit a warning in dmesg, that bits
55-60 are about to change in a couple of releases.  Next, if a user
issues soft-dirty clear command via the clear_refs file (it was disabled
before v3.9) we assume that he's aware of the new pagemap format, note
that fact and report the bits in pagemap in the new manner.

The "migration strategy" looks like this then:

1. existing users are not affected -- they don't touch soft-dirty feature, thus
   see old bits in pagemap, but are warned and have time to fix themselves
2. those who use soft-dirty know about new pagemap format
3. some time soon we get rid of any signs of page-shift in pagemap as well as
   this trick with clear-soft-dirty affecting pagemap format.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/pagemap.txt |  3 ++-
 fs/proc/task_mmu.c           | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 7587493c67f1..fd7c3cfddd8e 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -15,7 +15,8 @@ There are three components to pagemap:
     * Bits 0-54  page frame number (PFN) if present
     * Bits 0-4   swap type if swapped
     * Bits 5-54  swap offset if swapped
-    * Bits 55-60 page shift (page size = 1<<page shift)
+    * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+    * Bits 56-60 zero
     * Bit  61    page is file-page or shared-anon
     * Bit  62    page swapped
     * Bit  63    page present
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a18e065c1c3e..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -689,6 +689,23 @@ const struct file_operations proc_tid_smaps_operations = {
 	.release	= seq_release_private,
 };
 
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ *    but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ *    these flag is set to denote, that user is aware of the
+ *    new API and those page-shift bits change their meaning.
+ *    The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ *    of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
 enum clear_refs_types {
 	CLEAR_REFS_ALL = 1,
 	CLEAR_REFS_ANON,
@@ -778,6 +795,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	type = (enum clear_refs_types)itype;
 	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
 		return -EINVAL;
+
+	if (type == CLEAR_REFS_SOFT_DIRTY) {
+		soft_dirty_cleared = true;
+		pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
+				"See the linux/Documentation/vm/pagemap.txt for details.\n");
+	}
+
 	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
@@ -1091,7 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!count)
 		goto out_task;
 
-	pm.v2 = false;
+	pm.v2 = soft_dirty_cleared;
 	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
 	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
@@ -1164,9 +1188,18 @@ out:
 	return ret;
 }
 
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+			"to stop being page-shift some time soon. See the "
+			"linux/Documentation/vm/pagemap.txt for details.\n");
+	return 0;
+}
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
+	.open		= pagemap_open,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
-- 
cgit v1.2.3-58-ga151


From ef9f515a4c38c759c56c7d1e760cc243b6d516f4 Mon Sep 17 00:00:00 2001
From: Libin <huawei.libin@huawei.com>
Date: Wed, 3 Jul 2013 15:01:26 -0700
Subject: ncpfs: use vma_pages() to replace (vm_end - vm_start) >> PAGE_SHIFT

(*->vm_end - *->vm_start) >> PAGE_SHIFT operation is implemented
as a inline funcion vma_pages() in linux/mm.h, so using it.

Signed-off-by: Libin <huawei.libin@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ncpfs/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 	/* we do not support files bigger than 4GB... We eventually 
 	   supports just 4GB... */
-	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff 
+	if (vma_pages(vma) + vma->vm_pgoff
 	   > (1U << (32 - PAGE_SHIFT)))
 		return -EFBIG;
 
-- 
cgit v1.2.3-58-ga151


From b45972265f823ed01eae0867a176320071665787 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:02:05 -0700
Subject: mm: vmscan: take page buffers dirty and locked state into account

Page reclaim keeps track of dirty and under writeback pages and uses it
to determine if wait_iff_congested() should stall or if kswapd should
begin writing back pages.  This fails to account for buffer pages that
can be under writeback but not PageWriteback which is the case for
filesystems like ext3 ordered mode.  Furthermore, PageDirty buffer pages
can have all the buffers clean and writepage does no IO so it should not
be accounted as congested.

This patch adds an address_space operation that filesystems may
optionally use to check if a page is really dirty or really under
writeback.  An implementation is provided for for buffer_heads is added
and used for block operations and ext3 in ordered mode.  By default the
page flags are obeyed.

Credit goes to Jan Kara for identifying that the page flags alone are
not sufficient for ext3 and sanity checking a number of ideas on how the
problem could be addressed.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c              |  1 +
 fs/buffer.c                 | 34 ++++++++++++++++++++++++++++++++++
 fs/ext3/inode.c             |  1 +
 include/linux/buffer_head.h |  3 +++
 include/linux/fs.h          |  1 +
 mm/vmscan.c                 | 10 ++++++++++
 6 files changed, 50 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 431b6a04ebfd..bb43ce081d6e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1562,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = {
 	.writepages	= generic_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/buffer.c b/fs/buffer.c
index f93392e2df12..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -82,6 +82,40 @@ void unlock_buffer(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(unlock_buffer);
 
+/*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
 /*
  * Block until a buffer comes unlocked.  This doesn't stop it
  * from becoming locked again - you have to lock it yourself
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f67668f724ba..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1985,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f5a3b838ddb0..91fa9a94ae92 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -139,6 +139,9 @@ BUFFER_FNS(Prio, prio)
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback);
+
 /*
  * Declarations
  */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b82c8041490..99be011e00de 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -380,6 +380,7 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+	void (*is_dirty_writeback) (struct page *, bool *, bool *);
 	int (*error_remove_page)(struct address_space *, struct page *);
 
 	/* swapfile support */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf4778479e3a..c85794399848 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -673,6 +673,8 @@ static enum page_references page_check_references(struct page *page,
 static void page_check_dirty_writeback(struct page *page,
 				       bool *dirty, bool *writeback)
 {
+	struct address_space *mapping;
+
 	/*
 	 * Anonymous pages are not handled by flushers and must be written
 	 * from reclaim context. Do not stall reclaim based on them
@@ -686,6 +688,14 @@ static void page_check_dirty_writeback(struct page *page,
 	/* By default assume that the page flags are accurate */
 	*dirty = PageDirty(page);
 	*writeback = PageWriteback(page);
+
+	/* Verify dirty/writeback state if the filesystem supports it */
+	if (!page_has_private(page))
+		return;
+
+	mapping = page_mapping(page);
+	if (mapping && mapping->a_ops->is_dirty_writeback)
+		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
 /*
-- 
cgit v1.2.3-58-ga151


From f919b19614f06711cba300c1bb1e3d94c9ca21b0 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:02:06 -0700
Subject: fs: nfs: inform the VM about pages being committed or unstable

VM page reclaim uses dirty and writeback page states to determine if
flushers are cleaning pages too slowly and that page reclaim should
stall waiting on flushers to catch up.  Page state in NFS is a bit more
complex and a clean page can be unreclaimable due to being unstable
which is effectively "dirty" from the perspective of the VM from reclaim
context.  Similarly, if the inode is currently being committed then it's
similar to being under writeback.

This patch adds a is_dirty_writeback() handled for NFS that checks if a
pages backing inode is being committed and should be accounted as
writeback and if a page has private state indicating that it is
effectively dirty.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/file.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b4a79f4ad1d..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -495,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	return nfs_fscache_release_page(page, gfp);
 }
 
+static void nfs_check_dirty_writeback(struct page *page,
+				bool *dirty, bool *writeback)
+{
+	struct nfs_inode *nfsi;
+	struct address_space *mapping = page_file_mapping(page);
+
+	if (!mapping || PageSwapCache(page))
+		return;
+
+	/*
+	 * Check if an unstable page is currently being committed and
+	 * if so, have the VM treat it as if the page is under writeback
+	 * so it will not block due to pages that will shortly be freeable.
+	 */
+	nfsi = NFS_I(mapping->host);
+	if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+		*writeback = true;
+		return;
+	}
+
+	/*
+	 * If PagePrivate() is set, then the page is not freeable and as the
+	 * inode is not being committed, it's not going to be cleaned in the
+	 * near future so treat it as dirty
+	 */
+	if (PagePrivate(page))
+		*dirty = true;
+}
+
 /*
  * Attempt to clear the private state associated with a page when an error
  * occurs that requires the cached contents of an inode to be written back or
@@ -542,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
 	.direct_IO = nfs_direct_IO,
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
+	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
 #ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
-- 
cgit v1.2.3-58-ga151


From b27eb186608c89ef0979ae47c649859ceaa1b2e7 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:13 -0700
Subject: vmcore: clean up read_vmcore()

Rewrite part of read_vmcore() that reads objects in vmcore_list in the
same way as part reading ELF headers, by which some duplicated and
redundant codes are removed.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 68 +++++++++++++++++---------------------------------------
 1 file changed, 20 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..ab0c92e64411 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -118,27 +118,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 	return read;
 }
 
-/* Maps vmcore file offset to respective physical address in memroy. */
-static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
-					struct vmcore **m_ptr)
-{
-	struct vmcore *m;
-	u64 paddr;
-
-	list_for_each_entry(m, vc_list, list) {
-		u64 start, end;
-		start = m->offset;
-		end = m->offset + m->size - 1;
-		if (offset >= start && offset <= end) {
-			paddr = m->paddr + offset - start;
-			*m_ptr = m;
-			return paddr;
-		}
-	}
-	*m_ptr = NULL;
-	return 0;
-}
-
 /* Read from the ELF header and then the crash dump. On error, negative value is
  * returned otherwise number of bytes read are returned.
  */
@@ -147,8 +126,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 {
 	ssize_t acc = 0, tmp;
 	size_t tsz;
-	u64 start, nr_bytes;
-	struct vmcore *curr_m = NULL;
+	u64 start;
+	struct vmcore *m = NULL;
 
 	if (buflen == 0 || *fpos >= vmcore_size)
 		return 0;
@@ -174,33 +153,26 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 			return acc;
 	}
 
-	start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
-	if (!curr_m)
-        	return -EINVAL;
-
-	while (buflen) {
-		tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
-
-		/* Calculate left bytes in current memory segment. */
-		nr_bytes = (curr_m->size - (start - curr_m->paddr));
-		if (tsz > nr_bytes)
-			tsz = nr_bytes;
-
-		tmp = read_from_oldmem(buffer, tsz, &start, 1);
-		if (tmp < 0)
-			return tmp;
-		buflen -= tsz;
-		*fpos += tsz;
-		buffer += tsz;
-		acc += tsz;
-		if (start >= (curr_m->paddr + curr_m->size)) {
-			if (curr_m->list.next == &vmcore_list)
-				return acc;	/*EOF*/
-			curr_m = list_entry(curr_m->list.next,
-						struct vmcore, list);
-			start = curr_m->paddr;
+	list_for_each_entry(m, &vmcore_list, list) {
+		if (*fpos < m->offset + m->size) {
+			tsz = m->offset + m->size - *fpos;
+			if (buflen < tsz)
+				tsz = buflen;
+			start = m->paddr + *fpos - m->offset;
+			tmp = read_from_oldmem(buffer, tsz, &start, 1);
+			if (tmp < 0)
+				return tmp;
+			buflen -= tsz;
+			*fpos += tsz;
+			buffer += tsz;
+			acc += tsz;
+
+			/* leave now if filled buffer already */
+			if (buflen == 0)
+				return acc;
 		}
 	}
+
 	return acc;
 }
 
-- 
cgit v1.2.3-58-ga151


From f2bdacdd597d8d05c3d5f5d36273084f7ef7e6f5 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:14 -0700
Subject: vmcore: allocate buffer for ELF headers on page-size alignment

Allocate ELF headers on page-size boundary using __get_free_pages()
instead of kmalloc().

Later patch will merge PT_NOTE entries into a single unique one and
decrease the buffer size actually used.  Keep original buffer size in
variable elfcorebuf_sz_orig to kfree the buffer later and actually used
buffer size with rounded up to page-size boundary in variable
elfcorebuf_sz separately.

The size of part of the ELF buffer exported from /proc/vmcore is
elfcorebuf_sz.

The merged, removed PT_NOTE entries, i.e.  the range [elfcorebuf_sz,
elfcorebuf_sz_orig], is filled with 0.

Use size of the ELF headers as an initial offset value in
set_vmcore_list_offsets_elf{64,32} and
process_ptload_program_headers_elf{64,32} in order to indicate that the
offset includes the holes towards the page boundary.

As a result, both set_vmcore_list_offsets_elf{64,32} have the same
definition.  Merge them as set_vmcore_list_offsets.

[akpm@linux-foundation.org: add free_elfcorebuf(), cleanups]
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 119 ++++++++++++++++++++++++-------------------------------
 1 file changed, 51 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index ab0c92e64411..0b1c04e5e2c5 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,6 +32,7 @@ static LIST_HEAD(vmcore_list);
 /* Stores the pointer to the buffer containing kernel elf core headers. */
 static char *elfcorebuf;
 static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
 
 /* Total size of vmcore file. */
 static u64 vmcore_size;
@@ -186,7 +187,7 @@ static struct vmcore* __init get_new_element(void)
 	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
 }
 
-static u64 __init get_vmcore_size_elf64(char *elfptr)
+static u64 __init get_vmcore_size_elf64(char *elfptr, size_t elfsz)
 {
 	int i;
 	u64 size;
@@ -195,7 +196,7 @@ static u64 __init get_vmcore_size_elf64(char *elfptr)
 
 	ehdr_ptr = (Elf64_Ehdr *)elfptr;
 	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
-	size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
+	size = elfsz;
 	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
 		size += phdr_ptr->p_memsz;
 		phdr_ptr++;
@@ -203,7 +204,7 @@ static u64 __init get_vmcore_size_elf64(char *elfptr)
 	return size;
 }
 
-static u64 __init get_vmcore_size_elf32(char *elfptr)
+static u64 __init get_vmcore_size_elf32(char *elfptr, size_t elfsz)
 {
 	int i;
 	u64 size;
@@ -212,7 +213,7 @@ static u64 __init get_vmcore_size_elf32(char *elfptr)
 
 	ehdr_ptr = (Elf32_Ehdr *)elfptr;
 	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
-	size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr));
+	size = elfsz;
 	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
 		size += phdr_ptr->p_memsz;
 		phdr_ptr++;
@@ -294,6 +295,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
 	*elfsz = *elfsz - i;
 	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+	memset(elfptr + *elfsz, 0, i);
+	*elfsz = roundup(*elfsz, PAGE_SIZE);
 
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -375,6 +378,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
 	*elfsz = *elfsz - i;
 	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+	memset(elfptr + *elfsz, 0, i);
+	*elfsz = roundup(*elfsz, PAGE_SIZE);
 
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -398,8 +403,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
 	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
 
 	/* First program header is PT_NOTE header. */
-	vmcore_off = sizeof(Elf64_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
+	vmcore_off = elfsz +
 			phdr_ptr->p_memsz; /* Note sections */
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -435,8 +439,7 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
 
 	/* First program header is PT_NOTE header. */
-	vmcore_off = sizeof(Elf32_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
+	vmcore_off = elfsz +
 			phdr_ptr->p_memsz; /* Note sections */
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -459,18 +462,14 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 }
 
 /* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf64(char *elfptr,
-						struct list_head *vc_list)
+static void __init set_vmcore_list_offsets(size_t elfsz,
+					   struct list_head *vc_list)
 {
 	loff_t vmcore_off;
-	Elf64_Ehdr *ehdr_ptr;
 	struct vmcore *m;
 
-	ehdr_ptr = (Elf64_Ehdr *)elfptr;
-
 	/* Skip Elf header and program headers. */
-	vmcore_off = sizeof(Elf64_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+	vmcore_off = elfsz;
 
 	list_for_each_entry(m, vc_list, list) {
 		m->offset = vmcore_off;
@@ -478,24 +477,10 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
 	}
 }
 
-/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf32(char *elfptr,
-						struct list_head *vc_list)
+static void free_elfcorebuf(void)
 {
-	loff_t vmcore_off;
-	Elf32_Ehdr *ehdr_ptr;
-	struct vmcore *m;
-
-	ehdr_ptr = (Elf32_Ehdr *)elfptr;
-
-	/* Skip Elf header and program headers. */
-	vmcore_off = sizeof(Elf32_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
-
-	list_for_each_entry(m, vc_list, list) {
-		m->offset = vmcore_off;
-		vmcore_off += m->size;
-	}
+	free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+	elfcorebuf = NULL;
 }
 
 static int __init parse_crash_elf64_headers(void)
@@ -526,31 +511,31 @@ static int __init parse_crash_elf64_headers(void)
 	}
 
 	/* Read in all elf headers. */
-	elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
-	elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+	elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+				ehdr.e_phnum * sizeof(Elf64_Phdr);
+	elfcorebuf_sz = elfcorebuf_sz_orig;
+	elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					      get_order(elfcorebuf_sz_orig));
 	if (!elfcorebuf)
 		return -ENOMEM;
 	addr = elfcorehdr_addr;
-	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
-	if (rc < 0) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+	if (rc < 0)
+		goto fail;
 
 	/* Merge all PT_NOTE headers into one. */
 	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	if (rc)
+		goto fail;
 	rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
 							&vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
-	set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+	if (rc)
+		goto fail;
+	set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list);
 	return 0;
+fail:
+	free_elfcorebuf();
+	return rc;
 }
 
 static int __init parse_crash_elf32_headers(void)
@@ -581,31 +566,30 @@ static int __init parse_crash_elf32_headers(void)
 	}
 
 	/* Read in all elf headers. */
-	elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
-	elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+	elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+	elfcorebuf_sz = elfcorebuf_sz_orig;
+	elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					      get_order(elfcorebuf_sz_orig));
 	if (!elfcorebuf)
 		return -ENOMEM;
 	addr = elfcorehdr_addr;
-	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
-	if (rc < 0) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+	if (rc < 0)
+		goto fail;
 
 	/* Merge all PT_NOTE headers into one. */
 	rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	if (rc)
+		goto fail;
 	rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
 								&vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
-	set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
+	if (rc)
+		goto fail;
+	set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list);
 	return 0;
+fail:
+	free_elfcorebuf();
+	return rc;
 }
 
 static int __init parse_crash_elf_headers(void)
@@ -629,14 +613,14 @@ static int __init parse_crash_elf_headers(void)
 			return rc;
 
 		/* Determine vmcore size. */
-		vmcore_size = get_vmcore_size_elf64(elfcorebuf);
+		vmcore_size = get_vmcore_size_elf64(elfcorebuf, elfcorebuf_sz);
 	} else if (e_ident[EI_CLASS] == ELFCLASS32) {
 		rc = parse_crash_elf32_headers();
 		if (rc)
 			return rc;
 
 		/* Determine vmcore size. */
-		vmcore_size = get_vmcore_size_elf32(elfcorebuf);
+		vmcore_size = get_vmcore_size_elf32(elfcorebuf, elfcorebuf_sz);
 	} else {
 		pr_warn("Warning: Core image elf header is not sane\n");
 		return -EINVAL;
@@ -683,7 +667,6 @@ void vmcore_cleanup(void)
 		list_del(&m->list);
 		kfree(m);
 	}
-	kfree(elfcorebuf);
-	elfcorebuf = NULL;
+	free_elfcorebuf();
 }
 EXPORT_SYMBOL_GPL(vmcore_cleanup);
-- 
cgit v1.2.3-58-ga151


From 7f614cd1e052ebbddee7ea49c725dc75fee74a5a Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:15 -0700
Subject: vmcore: treat memory chunks referenced by PT_LOAD program header
 entries in page-size boundary in vmcore_list

Treat memory chunks referenced by PT_LOAD program header entries in
page-size boundary in vmcore_list.  Formally, for each range [start,
end], we set up the corresponding vmcore object in vmcore_list to
[rounddown(start, PAGE_SIZE), roundup(end, PAGE_SIZE)].

This change affects layout of /proc/vmcore.  The gaps generated by the
rearrangement are newly made visible to applications as holes.
Concretely, they are two ranges [rounddown(start, PAGE_SIZE), start] and
[end, roundup(end, PAGE_SIZE)].

Suppose variable m points at a vmcore object in vmcore_list, and
variable phdr points at the program header of PT_LOAD type the variable
m corresponds to.  Then, pictorially:

  m->offset                    +---------------+
                               | hole          |
phdr->p_offset =               +---------------+
  m->offset + (paddr - start)  |               |\
                               | kernel memory | phdr->p_memsz
                               |               |/
                               +---------------+
                               | hole          |
  m->offset + m->size          +---------------+

where m->offset and m->offset + m->size are always page-size aligned.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0b1c04e5e2c5..78c87a1ac01a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -407,20 +407,27 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
 			phdr_ptr->p_memsz; /* Note sections */
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 paddr, start, end, size;
+
 		if (phdr_ptr->p_type != PT_LOAD)
 			continue;
 
+		paddr = phdr_ptr->p_offset;
+		start = rounddown(paddr, PAGE_SIZE);
+		end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+		size = end - start;
+
 		/* Add this contiguous chunk of memory to vmcore list.*/
 		new = get_new_element();
 		if (!new)
 			return -ENOMEM;
-		new->paddr = phdr_ptr->p_offset;
-		new->size = phdr_ptr->p_memsz;
+		new->paddr = start;
+		new->size = size;
 		list_add_tail(&new->list, vc_list);
 
 		/* Update the program header offset. */
-		phdr_ptr->p_offset = vmcore_off;
-		vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+		phdr_ptr->p_offset = vmcore_off + (paddr - start);
+		vmcore_off = vmcore_off + size;
 	}
 	return 0;
 }
@@ -443,20 +450,27 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 			phdr_ptr->p_memsz; /* Note sections */
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 paddr, start, end, size;
+
 		if (phdr_ptr->p_type != PT_LOAD)
 			continue;
 
+		paddr = phdr_ptr->p_offset;
+		start = rounddown(paddr, PAGE_SIZE);
+		end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+		size = end - start;
+
 		/* Add this contiguous chunk of memory to vmcore list.*/
 		new = get_new_element();
 		if (!new)
 			return -ENOMEM;
-		new->paddr = phdr_ptr->p_offset;
-		new->size = phdr_ptr->p_memsz;
+		new->paddr = start;
+		new->size = size;
 		list_add_tail(&new->list, vc_list);
 
 		/* Update the program header offset */
-		phdr_ptr->p_offset = vmcore_off;
-		vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+		phdr_ptr->p_offset = vmcore_off + (paddr - start);
+		vmcore_off = vmcore_off + size;
 	}
 	return 0;
 }
-- 
cgit v1.2.3-58-ga151


From 087350c9dcf1b38c597b31d7761f7366e2866e6b Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:19 -0700
Subject: vmcore: allocate ELF note segment in the 2nd kernel vmalloc memory

The reasons why we don't allocate ELF note segment in the 1st kernel
(old memory) on page boundary is to keep backward compatibility for old
kernels, and that if doing so, we waste not a little memory due to
round-up operation to fit the memory to page boundary since most of the
buffers are in per-cpu area.

ELF notes are per-cpu, so total size of ELF note segments depends on
number of CPUs.  The current maximum number of CPUs on x86_64 is 5192,
and there's already system with 4192 CPUs in SGI, where total size
amounts to 1MB.  This can be larger in the near future or possibly even
now on another architecture that has larger size of note per a single
cpu.  Thus, to avoid the case where memory allocation for large block
fails, we allocate vmcore objects on vmalloc memory.

This patch adds elfnotes_buf and elfnotes_sz variables to keep pointer
to the ELF note segment buffer and its size.  There's no longer the
vmcore object that corresponds to the ELF note segment in vmcore_list.
Accordingly, read_vmcore() has new case for ELF note segment and
set_vmcore_list_offsets_elf{64,32}() and other helper functions starts
calculating offset from sum of size of ELF headers and size of ELF note
segment.

[akpm@linux-foundation.org: use min(), fix error-path vzalloc() leaks]
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 359 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 286 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 78c87a1ac01a..9b9270eb0599 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -34,6 +34,9 @@ static char *elfcorebuf;
 static size_t elfcorebuf_sz;
 static size_t elfcorebuf_sz_orig;
 
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
+
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
@@ -139,9 +142,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 	/* Read ELF core header */
 	if (*fpos < elfcorebuf_sz) {
-		tsz = elfcorebuf_sz - *fpos;
-		if (buflen < tsz)
-			tsz = buflen;
+		tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
 		if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
 			return -EFAULT;
 		buflen -= tsz;
@@ -154,11 +155,27 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 			return acc;
 	}
 
+	/* Read Elf note segment */
+	if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+		void *kaddr;
+
+		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+		if (copy_to_user(buffer, kaddr, tsz))
+			return -EFAULT;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	}
+
 	list_for_each_entry(m, &vmcore_list, list) {
 		if (*fpos < m->offset + m->size) {
-			tsz = m->offset + m->size - *fpos;
-			if (buflen < tsz)
-				tsz = buflen;
+			tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
 			start = m->paddr + *fpos - m->offset;
 			tmp = read_from_oldmem(buffer, tsz, &start, 1);
 			if (tmp < 0)
@@ -221,27 +238,27 @@ static u64 __init get_vmcore_size_elf32(char *elfptr, size_t elfsz)
 	return size;
 }
 
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
-						struct list_head *vc_list)
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
 {
-	int i, nr_ptnote=0, rc=0;
-	char *tmp;
-	Elf64_Ehdr *ehdr_ptr;
-	Elf64_Phdr phdr, *phdr_ptr;
+	int i, rc=0;
+	Elf64_Phdr *phdr_ptr;
 	Elf64_Nhdr *nhdr_ptr;
-	u64 phdr_sz = 0, note_off;
 
-	ehdr_ptr = (Elf64_Ehdr *)elfptr;
-	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+	phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
-		int j;
 		void *notes_section;
-		struct vmcore *new;
 		u64 offset, max_sz, sz, real_sz = 0;
 		if (phdr_ptr->p_type != PT_NOTE)
 			continue;
-		nr_ptnote++;
 		max_sz = phdr_ptr->p_memsz;
 		offset = phdr_ptr->p_offset;
 		notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -253,7 +270,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		for (j = 0; j < max_sz; j += sz) {
+		while (real_sz < max_sz) {
 			if (nhdr_ptr->n_namesz == 0)
 				break;
 			sz = sizeof(Elf64_Nhdr) +
@@ -262,26 +279,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 			real_sz += sz;
 			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
 		}
-
-		/* Add this contiguous chunk of notes section to vmcore list.*/
-		new = get_new_element();
-		if (!new) {
-			kfree(notes_section);
-			return -ENOMEM;
-		}
-		new->paddr = phdr_ptr->p_offset;
-		new->size = real_sz;
-		list_add_tail(&new->list, vc_list);
-		phdr_sz += real_sz;
 		kfree(notes_section);
+		phdr_ptr->p_memsz = real_sz;
 	}
 
+	return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+						 int *nr_ptnote, u64 *sz_ptnote)
+{
+	int i;
+	Elf64_Phdr *phdr_ptr;
+
+	*nr_ptnote = *sz_ptnote = 0;
+
+	phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		*nr_ptnote += 1;
+		*sz_ptnote += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+	int i, rc=0;
+	Elf64_Phdr *phdr_ptr;
+
+	phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 offset;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		offset = phdr_ptr->p_offset;
+		rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+		if (rc < 0)
+			return rc;
+		notes_buf += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+					   char **notes_buf, size_t *notes_sz)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr phdr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+	rc = update_note_header_size_elf64(ehdr_ptr);
+	if (rc < 0)
+		return rc;
+
+	rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+	if (rc < 0)
+		return rc;
+
+	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
+	*notes_buf = vzalloc(*notes_sz);
+	if (!*notes_buf)
+		return -ENOMEM;
+
+	rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+	if (rc < 0)
+		return rc;
+
 	/* Prepare merged PT_NOTE program header. */
 	phdr.p_type    = PT_NOTE;
 	phdr.p_flags   = 0;
 	note_off = sizeof(Elf64_Ehdr) +
 			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
-	phdr.p_offset  = note_off;
+	phdr.p_offset  = roundup(note_off, PAGE_SIZE);
 	phdr.p_vaddr   = phdr.p_paddr = 0;
 	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
 	phdr.p_align   = 0;
@@ -304,27 +417,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	return 0;
 }
 
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
-						struct list_head *vc_list)
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
 {
-	int i, nr_ptnote=0, rc=0;
-	char *tmp;
-	Elf32_Ehdr *ehdr_ptr;
-	Elf32_Phdr phdr, *phdr_ptr;
+	int i, rc=0;
+	Elf32_Phdr *phdr_ptr;
 	Elf32_Nhdr *nhdr_ptr;
-	u64 phdr_sz = 0, note_off;
 
-	ehdr_ptr = (Elf32_Ehdr *)elfptr;
-	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
+	phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
-		int j;
 		void *notes_section;
-		struct vmcore *new;
 		u64 offset, max_sz, sz, real_sz = 0;
 		if (phdr_ptr->p_type != PT_NOTE)
 			continue;
-		nr_ptnote++;
 		max_sz = phdr_ptr->p_memsz;
 		offset = phdr_ptr->p_offset;
 		notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -336,7 +449,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		for (j = 0; j < max_sz; j += sz) {
+		while (real_sz < max_sz) {
 			if (nhdr_ptr->n_namesz == 0)
 				break;
 			sz = sizeof(Elf32_Nhdr) +
@@ -345,26 +458,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 			real_sz += sz;
 			nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
 		}
-
-		/* Add this contiguous chunk of notes section to vmcore list.*/
-		new = get_new_element();
-		if (!new) {
-			kfree(notes_section);
-			return -ENOMEM;
-		}
-		new->paddr = phdr_ptr->p_offset;
-		new->size = real_sz;
-		list_add_tail(&new->list, vc_list);
-		phdr_sz += real_sz;
 		kfree(notes_section);
+		phdr_ptr->p_memsz = real_sz;
+	}
+
+	return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+						 int *nr_ptnote, u64 *sz_ptnote)
+{
+	int i;
+	Elf32_Phdr *phdr_ptr;
+
+	*nr_ptnote = *sz_ptnote = 0;
+
+	phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		*nr_ptnote += 1;
+		*sz_ptnote += phdr_ptr->p_memsz;
 	}
 
+	return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+	int i, rc=0;
+	Elf32_Phdr *phdr_ptr;
+
+	phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 offset;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		offset = phdr_ptr->p_offset;
+		rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+		if (rc < 0)
+			return rc;
+		notes_buf += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+					   char **notes_buf, size_t *notes_sz)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf32_Ehdr *ehdr_ptr;
+	Elf32_Phdr phdr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+	rc = update_note_header_size_elf32(ehdr_ptr);
+	if (rc < 0)
+		return rc;
+
+	rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+	if (rc < 0)
+		return rc;
+
+	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
+	*notes_buf = vzalloc(*notes_sz);
+	if (!*notes_buf)
+		return -ENOMEM;
+
+	rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+	if (rc < 0)
+		return rc;
+
 	/* Prepare merged PT_NOTE program header. */
 	phdr.p_type    = PT_NOTE;
 	phdr.p_flags   = 0;
 	note_off = sizeof(Elf32_Ehdr) +
 			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
-	phdr.p_offset  = note_off;
+	phdr.p_offset  = roundup(note_off, PAGE_SIZE);
 	phdr.p_vaddr   = phdr.p_paddr = 0;
 	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
 	phdr.p_align   = 0;
@@ -391,6 +600,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
  * the new offset fields of exported program headers. */
 static int __init process_ptload_program_headers_elf64(char *elfptr,
 						size_t elfsz,
+						size_t elfnotes_sz,
 						struct list_head *vc_list)
 {
 	int i;
@@ -402,9 +612,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
 	ehdr_ptr = (Elf64_Ehdr *)elfptr;
 	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
 
-	/* First program header is PT_NOTE header. */
-	vmcore_off = elfsz +
-			phdr_ptr->p_memsz; /* Note sections */
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
 		u64 paddr, start, end, size;
@@ -434,6 +643,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
 
 static int __init process_ptload_program_headers_elf32(char *elfptr,
 						size_t elfsz,
+						size_t elfnotes_sz,
 						struct list_head *vc_list)
 {
 	int i;
@@ -445,9 +655,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 	ehdr_ptr = (Elf32_Ehdr *)elfptr;
 	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
 
-	/* First program header is PT_NOTE header. */
-	vmcore_off = elfsz +
-			phdr_ptr->p_memsz; /* Note sections */
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
 		u64 paddr, start, end, size;
@@ -476,14 +685,14 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 }
 
 /* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets(size_t elfsz,
+static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
 					   struct list_head *vc_list)
 {
 	loff_t vmcore_off;
 	struct vmcore *m;
 
-	/* Skip Elf header and program headers. */
-	vmcore_off = elfsz;
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
 
 	list_for_each_entry(m, vc_list, list) {
 		m->offset = vmcore_off;
@@ -495,6 +704,8 @@ static void free_elfcorebuf(void)
 {
 	free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
 	elfcorebuf = NULL;
+	vfree(elfnotes_buf);
+	elfnotes_buf = NULL;
 }
 
 static int __init parse_crash_elf64_headers(void)
@@ -538,14 +749,15 @@ static int __init parse_crash_elf64_headers(void)
 		goto fail;
 
 	/* Merge all PT_NOTE headers into one. */
-	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
+	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+				      &elfnotes_buf, &elfnotes_sz);
 	if (rc)
 		goto fail;
 	rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
-							&vmcore_list);
+						  elfnotes_sz, &vmcore_list);
 	if (rc)
 		goto fail;
-	set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list);
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
 	return 0;
 fail:
 	free_elfcorebuf();
@@ -592,14 +804,15 @@ static int __init parse_crash_elf32_headers(void)
 		goto fail;
 
 	/* Merge all PT_NOTE headers into one. */
-	rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
+	rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+				      &elfnotes_buf, &elfnotes_sz);
 	if (rc)
 		goto fail;
 	rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
-								&vmcore_list);
+						  elfnotes_sz, &vmcore_list);
 	if (rc)
 		goto fail;
-	set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list);
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
 	return 0;
 fail:
 	free_elfcorebuf();
-- 
cgit v1.2.3-58-ga151


From ef9e78fd2753213ea01d77f7a76a9cb6ad0f50a7 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:21 -0700
Subject: vmcore: allow user process to remap ELF note segment buffer

Now ELF note segment has been copied in the buffer on vmalloc memory.
To allow user process to remap the ELF note segment buffer with
remap_vmalloc_page, the corresponding VM area object has to have
VM_USERMAP flag set.

[akpm@linux-foundation.org: use the conventional comment layout]
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9b9270eb0599..1082492e02fc 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -369,6 +369,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	Elf64_Ehdr *ehdr_ptr;
 	Elf64_Phdr phdr;
 	u64 phdr_sz = 0, note_off;
+	struct vm_struct *vm;
 
 	ehdr_ptr = (Elf64_Ehdr *)elfptr;
 
@@ -385,6 +386,14 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	if (!*notes_buf)
 		return -ENOMEM;
 
+	/*
+	 * Allow users to remap ELF note segment buffer on vmalloc memory using
+	 * remap_vmalloc_range.()
+	 */
+	vm = find_vm_area(*notes_buf);
+	BUG_ON(!vm);
+	vm->flags |= VM_USERMAP;
+
 	rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
 	if (rc < 0)
 		return rc;
@@ -548,6 +557,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	Elf32_Ehdr *ehdr_ptr;
 	Elf32_Phdr phdr;
 	u64 phdr_sz = 0, note_off;
+	struct vm_struct *vm;
 
 	ehdr_ptr = (Elf32_Ehdr *)elfptr;
 
@@ -564,6 +574,14 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	if (!*notes_buf)
 		return -ENOMEM;
 
+	/*
+	 * Allow users to remap ELF note segment buffer on vmalloc memory using
+	 * remap_vmalloc_range()
+	 */
+	vm = find_vm_area(*notes_buf);
+	BUG_ON(!vm);
+	vm->flags |= VM_USERMAP;
+
 	rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
 	if (rc < 0)
 		return rc;
-- 
cgit v1.2.3-58-ga151


From 591ff71664e764a3806e341370f3c758cb2e7e3c Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:22 -0700
Subject: vmcore: calculate vmcore file size from buffer size and total size of
 vmcore objects

The previous patches newly added holes before each chunk of memory and
the holes need to be count in vmcore file size.  There are two ways to
count file size in such a way:

1) suppose m is a poitner to the last vmcore object in vmcore_list.
   Then file size is (m->offset + m->size), or

2) calculate sum of size of buffers for ELF header, program headers,
   ELF note segments and objects in vmcore_list.

Although 1) is more direct and simpler than 2), 2) seems better in that
it reflects internal object structure of /proc/vmcore.  Thus, this patch
changes get_vmcore_size_elf{64, 32} so that it calculates size in the
way of 2).

As a result, both get_vmcore_size_elf{64, 32} have the same definition.
Merge them as get_vmcore_size.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 44 +++++++++++---------------------------------
 1 file changed, 11 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 1082492e02fc..8ec648368985 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -204,36 +204,15 @@ static struct vmcore* __init get_new_element(void)
 	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
 }
 
-static u64 __init get_vmcore_size_elf64(char *elfptr, size_t elfsz)
+static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+				  struct list_head *vc_list)
 {
-	int i;
 	u64 size;
-	Elf64_Ehdr *ehdr_ptr;
-	Elf64_Phdr *phdr_ptr;
-
-	ehdr_ptr = (Elf64_Ehdr *)elfptr;
-	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
-	size = elfsz;
-	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
-		size += phdr_ptr->p_memsz;
-		phdr_ptr++;
-	}
-	return size;
-}
-
-static u64 __init get_vmcore_size_elf32(char *elfptr, size_t elfsz)
-{
-	int i;
-	u64 size;
-	Elf32_Ehdr *ehdr_ptr;
-	Elf32_Phdr *phdr_ptr;
+	struct vmcore *m;
 
-	ehdr_ptr = (Elf32_Ehdr *)elfptr;
-	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
-	size = elfsz;
-	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
-		size += phdr_ptr->p_memsz;
-		phdr_ptr++;
+	size = elfsz + elfnotesegsz;
+	list_for_each_entry(m, vc_list, list) {
+		size += m->size;
 	}
 	return size;
 }
@@ -856,20 +835,19 @@ static int __init parse_crash_elf_headers(void)
 		rc = parse_crash_elf64_headers();
 		if (rc)
 			return rc;
-
-		/* Determine vmcore size. */
-		vmcore_size = get_vmcore_size_elf64(elfcorebuf, elfcorebuf_sz);
 	} else if (e_ident[EI_CLASS] == ELFCLASS32) {
 		rc = parse_crash_elf32_headers();
 		if (rc)
 			return rc;
-
-		/* Determine vmcore size. */
-		vmcore_size = get_vmcore_size_elf32(elfcorebuf, elfcorebuf_sz);
 	} else {
 		pr_warn("Warning: Core image elf header is not sane\n");
 		return -EINVAL;
 	}
+
+	/* Determine vmcore size. */
+	vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+				      &vmcore_list);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-58-ga151


From 83086978c63afd7c73e1c173c84aeab184c1e916 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:23 -0700
Subject: vmcore: support mmap() on /proc/vmcore

This patch introduces mmap_vmcore().

Don't permit writable nor executable mapping even with mprotect()
because this mmap() is aimed at reading crash dump memory.  Non-writable
mapping is also requirement of remap_pfn_range() when mapping linear
pages on non-consecutive physical pages; see is_cow_mapping().

Set VM_MIXEDMAP flag to remap memory by remap_pfn_range and by
remap_vmalloc_range_pertial at the same time for a single vma.
do_munmap() can correctly clean partially remapped vma with two
functions in abnormal case.  See zap_pte_range(), vm_normal_page() and
their comments for details.

On x86-32 PAE kernels, mmap() supports at most 16TB memory only.  This
limitation comes from the fact that the third argument of
remap_pfn_range(), pfn, is of 32-bit length on x86-32: unsigned long.

[akpm@linux-foundation.org: use min(), switch to conventional error-unwinding approach]
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Tested-by: Maxim Uvarov <muvarov@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 116 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 8ec648368985..28503172f2e4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/crash_dump.h>
 #include <linux/list.h>
+#include <linux/vmalloc.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include "internal.h"
@@ -194,9 +195,122 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 	return acc;
 }
 
+/**
+ * alloc_elfnotes_buf - allocate buffer for ELF note segment in
+ *                      vmalloc memory
+ *
+ * @notes_sz: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *alloc_elfnotes_buf(size_t notes_sz)
+{
+#ifdef CONFIG_MMU
+	return vmalloc_user(notes_sz);
+#else
+	return vzalloc(notes_sz);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start;
+	u64 start, end, len, tsz;
+	struct vmcore *m;
+
+	start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+	end = start + size;
+
+	if (size > vmcore_size || end > vmcore_size)
+		return -EINVAL;
+
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	len = 0;
+
+	if (start < elfcorebuf_sz) {
+		u64 pfn;
+
+		tsz = min(elfcorebuf_sz - (size_t)start, size);
+		pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+		if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+				    vma->vm_page_prot))
+			return -EAGAIN;
+		size -= tsz;
+		start += tsz;
+		len += tsz;
+
+		if (size == 0)
+			return 0;
+	}
+
+	if (start < elfcorebuf_sz + elfnotes_sz) {
+		void *kaddr;
+
+		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+		kaddr = elfnotes_buf + start - elfcorebuf_sz;
+		if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+						kaddr, tsz))
+			goto fail;
+		size -= tsz;
+		start += tsz;
+		len += tsz;
+
+		if (size == 0)
+			return 0;
+	}
+
+	list_for_each_entry(m, &vmcore_list, list) {
+		if (start < m->offset + m->size) {
+			u64 paddr = 0;
+
+			tsz = min_t(size_t, m->offset + m->size - start, size);
+			paddr = m->paddr + start - m->offset;
+			if (remap_pfn_range(vma, vma->vm_start + len,
+					    paddr >> PAGE_SHIFT, tsz,
+					    vma->vm_page_prot))
+				goto fail;
+			size -= tsz;
+			start += tsz;
+			len += tsz;
+
+			if (size == 0)
+				return 0;
+		}
+	}
+
+	return 0;
+fail:
+	do_munmap(vma->vm_mm, vma->vm_start, len);
+	return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+	return -ENOSYS;
+}
+#endif
+
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
 	.llseek		= default_llseek,
+	.mmap		= mmap_vmcore,
 };
 
 static struct vmcore* __init get_new_element(void)
@@ -348,7 +462,6 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	Elf64_Ehdr *ehdr_ptr;
 	Elf64_Phdr phdr;
 	u64 phdr_sz = 0, note_off;
-	struct vm_struct *vm;
 
 	ehdr_ptr = (Elf64_Ehdr *)elfptr;
 
@@ -361,18 +474,10 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = vzalloc(*notes_sz);
+	*notes_buf = alloc_elfnotes_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
-	/*
-	 * Allow users to remap ELF note segment buffer on vmalloc memory using
-	 * remap_vmalloc_range.()
-	 */
-	vm = find_vm_area(*notes_buf);
-	BUG_ON(!vm);
-	vm->flags |= VM_USERMAP;
-
 	rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
 	if (rc < 0)
 		return rc;
@@ -536,7 +641,6 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	Elf32_Ehdr *ehdr_ptr;
 	Elf32_Phdr phdr;
 	u64 phdr_sz = 0, note_off;
-	struct vm_struct *vm;
 
 	ehdr_ptr = (Elf32_Ehdr *)elfptr;
 
@@ -549,18 +653,10 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = vzalloc(*notes_sz);
+	*notes_buf = alloc_elfnotes_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
-	/*
-	 * Allow users to remap ELF note segment buffer on vmalloc memory using
-	 * remap_vmalloc_range()
-	 */
-	vm = find_vm_area(*notes_buf);
-	BUG_ON(!vm);
-	vm->flags |= VM_USERMAP;
-
 	rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
 	if (rc < 0)
 		return rc;
-- 
cgit v1.2.3-58-ga151


From a0b8cab3b9b2efadabdcff264c450ca515e2619c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:02:32 -0700
Subject: mm: remove lru parameter from __pagevec_lru_add and remove parts of
 pagevec API

Now that the LRU to add a page to is decided at LRU-add time, remove the
misleading lru parameter from __pagevec_lru_add.  A consequence of this
is that the pagevec_lru_add_file, pagevec_lru_add_anon and similar
helpers are misleading as the caller no longer has direct control over
what LRU the page is added to.  Unused helpers are removed by this patch
and existing users of pagevec_lru_add_file() are converted to use
lru_cache_add_file() directly and use the per-cpu pagevecs instead of
creating their own pagevec.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com>
Cc: Andrew Perepechko <anserper@ya.ru>
Cc: Robin Dong <sanbai@taobao.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Bernd Schubert <bernd.schubert@fastmail.fm>
Cc: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cachefiles/rdwr.c    | 30 +++++++-----------------------
 fs/nfs/dir.c            |  7 ++-----
 include/linux/pagevec.h | 34 +---------------------------------
 mm/swap.c               | 12 ++++--------
 4 files changed, 14 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/swap.h>
 #include "internal.h"
 
 /*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
  */
 static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 					    struct fscache_retrieval *op,
-					    struct page *netpage,
-					    struct pagevec *pagevec)
+					    struct page *netpage)
 {
 	struct cachefiles_one_read *monitor;
 	struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 
 	_enter("");
 
-	pagevec_reinit(pagevec);
-
 	_debug("read back %p{%lu,%d}",
 	       netpage, netpage->index, page_count(netpage));
 
@@ -283,9 +281,7 @@ installed_new_backing_page:
 	backpage = newpage;
 	newpage = NULL;
 
-	page_cache_get(backpage);
-	pagevec_add(pagevec, backpage);
-	__pagevec_lru_add_file(pagevec);
+	lru_cache_add_file(backpage);
 
 read_backing_page:
 	ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	if (block) {
 		/* submit the apparently valid page to the backing fs to be
 		 * read from disk */
-		ret = cachefiles_read_backing_file_one(object, op, page,
-						       &pagevec);
+		ret = cachefiles_read_backing_file_one(object, op, page);
 	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
 		/* there's space in the cache we can use */
 		fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 {
 	struct cachefiles_one_read *monitor = NULL;
 	struct address_space *bmapping = object->backer->d_inode->i_mapping;
-	struct pagevec lru_pvec;
 	struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
 	int ret = 0;
 
 	_enter("");
 
-	pagevec_init(&lru_pvec, 0);
-
 	list_for_each_entry_safe(netpage, _n, list, lru) {
 		list_del(&netpage->lru);
 
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 		backpage = newpage;
 		newpage = NULL;
 
-		page_cache_get(backpage);
-		if (!pagevec_add(&lru_pvec, backpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(backpage);
 
 	reread_backing_page:
 		ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto nomem;
 		}
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* install a monitor */
 		page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 		fscache_mark_page_cached(op, netpage);
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* the netpage is unlocked and marked up to date here */
 		fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 out:
 	/* tidy up */
-	pagevec_lru_add_file(&lru_pvec);
-
 	if (newpage)
 		page_cache_release(newpage);
 	if (netpage)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d051419527b..d7ed697133f0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
 #include <linux/pagevec.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/swap.h>
 #include <linux/sched.h>
 #include <linux/kmemleak.h>
 #include <linux/xattr.h>
@@ -1758,7 +1759,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
  */
 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	struct pagevec lru_pvec;
 	struct page *page;
 	char *kaddr;
 	struct iattr attr;
@@ -1798,11 +1798,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	 * No big deal if we can't add this page to the page cache here.
 	 * READLINK will get the missing page from the server if needed.
 	 */
-	pagevec_init(&lru_pvec, 0);
-	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+	if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
-		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 2aa12b8499c0..e4dbfab37729 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -21,7 +21,7 @@ struct pagevec {
 };
 
 void __pagevec_release(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
+void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
@@ -64,36 +64,4 @@ static inline void pagevec_release(struct pagevec *pvec)
 		__pagevec_release(pvec);
 }
 
-static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
-}
-
-static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_ACTIVE_ANON);
-}
-
-static inline void __pagevec_lru_add_file(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_INACTIVE_FILE);
-}
-
-static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
-}
-
-static inline void pagevec_lru_add_file(struct pagevec *pvec)
-{
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_file(pvec);
-}
-
-static inline void pagevec_lru_add_anon(struct pagevec *pvec)
-{
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_anon(pvec);
-}
-
 #endif /* _LINUX_PAGEVEC_H */
diff --git a/mm/swap.c b/mm/swap.c
index c53d161fc76d..6a9d0c43924a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -505,7 +505,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
 
 	page_cache_get(page);
 	if (!pagevec_space(pvec))
-		__pagevec_lru_add(pvec, lru);
+		__pagevec_lru_add(pvec);
 	pagevec_add(pvec, page);
 	put_cpu_var(lru_add_pvec);
 }
@@ -628,7 +628,7 @@ void lru_add_drain_cpu(int cpu)
 	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
 
 	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec, NR_LRU_LISTS);
+		__pagevec_lru_add(pvec);
 
 	pvec = &per_cpu(lru_rotate_pvecs, cpu);
 	if (pagevec_count(pvec)) {
@@ -832,12 +832,10 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 				 void *arg)
 {
-	enum lru_list requested_lru = (enum lru_list)arg;
 	int file = page_is_file_cache(page);
 	int active = PageActive(page);
 	enum lru_list lru = page_lru(page);
 
-	WARN_ON_ONCE(requested_lru < NR_LRU_LISTS && requested_lru != lru);
 	VM_BUG_ON(PageUnevictable(page));
 	VM_BUG_ON(PageLRU(page));
 
@@ -851,11 +849,9 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
  */
-void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
+void __pagevec_lru_add(struct pagevec *pvec)
 {
-	VM_BUG_ON(is_unevictable_lru(lru));
-
-	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
+	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
 }
 EXPORT_SYMBOL(__pagevec_lru_add);
 
-- 
cgit v1.2.3-58-ga151


From 0ed5fd138539940a493dc69359cb2f49de70ad89 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:03:43 -0700
Subject: mm: use totalram_pages instead of num_physpages at runtime

The global variable num_physpages is scheduled to be removed, so use
totalram_pages instead of num_physpages at runtime.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/inode.c          | 2 +-
 kernel/power/snapshot.c  | 4 ++--
 net/ipv4/inet_fragment.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a0cdde14a08..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
 static void sanitize_global_limit(unsigned *limit)
 {
 	if (*limit == 0)
-		*limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
 			 sizeof(struct fuse_req);
 
 	if (*limit >= 1 << 16)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807d..8b5d1cd933f4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1651,7 +1651,7 @@ unsigned long snapshot_get_image_size(void)
 static int init_header(struct swsusp_info *info)
 {
 	memset(info, 0, sizeof(struct swsusp_info));
-	info->num_physpages = num_physpages;
+	info->num_physpages = get_num_physpages();
 	info->image_pages = nr_copy_pages;
 	info->pages = snapshot_get_image_size();
 	info->size = info->pages;
@@ -1795,7 +1795,7 @@ static int check_header(struct swsusp_info *info)
 	char *reason;
 
 	reason = check_image_kernel(info);
-	if (!reason && info->num_physpages != num_physpages)
+	if (!reason && info->num_physpages != get_num_physpages())
 		reason = "memory size";
 	if (reason) {
 		printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 7e06641e36ae..cec539458307 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -93,7 +93,7 @@ void inet_frags_init(struct inet_frags *f)
 	}
 	rwlock_init(&f->lock);
 
-	f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+	f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^
 				   (jiffies ^ (jiffies >> 6)));
 
 	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
-- 
cgit v1.2.3-58-ga151


From 096a8aac6bf4a5a0b2ef812ad76d056bbf3fb2af Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jul 2013 15:04:55 -0700
Subject: clean up scary strncpy(dst, src, strlen(src)) uses

Fix various weird constructions of strncpy(dst, src, strlen(src)).

Length limits should be about the space available in the destination,
not repurposed as a method to either always include or always exclude a
trailing NULL byte.  Either the NULL should always be copied (using
strlcpy), or it should not be copied (using something like memcpy).
Readable code should not depend on the weird behavior of strncpy when it
hits the length limit.  Better to avoid the anti-pattern entirely.

[akpm@linux-foundation.org: revert getdelays.c part due to missing bsd/string.h]
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	[staging]
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>	[acpi]
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ursula Braun <ursula.braun@de.ibm.com>
Cc: Frank Blaschka <blaschka@linux.vnet.ibm.com>
Cc: Richard Weinberger <richard@nod.at>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/sysfs.c                             |  3 +--
 drivers/s390/net/qeth_l3_sys.c                   |  6 ++----
 drivers/staging/tidspbridge/rmgr/drv_interface.c |  3 +--
 fs/hppfs/hppfs.c                                 | 11 ++++++-----
 4 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
index fcae5fa2e1b3..193745d96fcc 100644
--- a/drivers/acpi/sysfs.c
+++ b/drivers/acpi/sysfs.c
@@ -677,10 +677,9 @@ void acpi_irq_stats_init(void)
 		else
 			sprintf(buffer, "bug%02X", i);
 
-		name = kzalloc(strlen(buffer) + 1, GFP_KERNEL);
+		name = kstrdup(buffer, GFP_KERNEL);
 		if (name == NULL)
 			goto fail;
-		strncpy(name, buffer, strlen(buffer) + 1);
 
 		sysfs_attr_init(&counter_attrs[i].attr);
 		counter_attrs[i].attr.name = name;
diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index e70af2406ff9..d1c8025b0b03 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -315,10 +315,8 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev,
 	if (qeth_configure_cq(card, QETH_CQ_ENABLED))
 		return -EPERM;
 
-	for (i = 0; i < 8; i++)
-		card->options.hsuid[i] = ' ';
-	card->options.hsuid[8] = '\0';
-	strncpy(card->options.hsuid, tmp, strlen(tmp));
+	snprintf(card->options.hsuid, sizeof(card->options.hsuid),
+		 "%-8s", tmp);
 	ASCEBC(card->options.hsuid, 8);
 	if (card->dev)
 		memcpy(card->dev->perm_addr, card->options.hsuid, 9);
diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c
index 9c020562c846..6d04eb48bfbc 100644
--- a/drivers/staging/tidspbridge/rmgr/drv_interface.c
+++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c
@@ -421,12 +421,11 @@ static int omap3_bridge_startup(struct platform_device *pdev)
 	drv_datap->tc_wordswapon = tc_wordswapon;
 
 	if (base_img) {
-		drv_datap->base_img = kmalloc(strlen(base_img) + 1, GFP_KERNEL);
+		drv_datap->base_img = kstrdup(base_img, GFP_KERNEL);
 		if (!drv_datap->base_img) {
 			err = -ENOMEM;
 			goto err2;
 		}
-		strncpy(drv_datap->base_img, base_img, strlen(base_img) + 1);
 	}
 
 	dev_set_drvdata(bridge, drv_datap);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index fc90ab11c340..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	struct dentry *parent;
 	char *root, *name;
 	const char *seg_name;
-	int len, seg_len;
+	int len, seg_len, root_len;
 
 	len = 0;
 	parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	}
 
 	root = "proc";
-	len += strlen(root);
+	root_len = strlen(root);
+	len += root_len;
 	name = kmalloc(len + extra + 1, GFP_KERNEL);
 	if (name == NULL)
 		return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	while (parent->d_parent != parent) {
 		if (is_pid(parent)) {
 			seg_name = "pid";
-			seg_len = strlen("pid");
+			seg_len = strlen(seg_name);
 		}
 		else {
 			seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
 
 		len -= seg_len + 1;
 		name[len] = '/';
-		strncpy(&name[len + 1], seg_name, seg_len);
+		memcpy(&name[len + 1], seg_name, seg_len);
 		parent = parent->d_parent;
 	}
-	strncpy(name, root, strlen(root));
+	memcpy(name, root, root_len);
 	return name;
 }
 
-- 
cgit v1.2.3-58-ga151


From f170168b9a0b61ea1e647b082b38f605f1d3de3e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jul 2013 15:04:58 -0700
Subject: drivers: avoid parsing names as kthread_run() format strings

Calling kthread_run with a single name parameter causes it to be handled
as a format string. Many callers are passing potentially dynamic string
content, so use "%s" in those cases to avoid any potential accidents.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/aoe/aoecmd.c           | 2 +-
 drivers/block/mtip32xx/mtip32xx.c    | 3 ++-
 drivers/block/xen-blkback/xenbus.c   | 2 +-
 drivers/hwmon/adt7470.c              | 2 +-
 drivers/media/i2c/tvaudio.c          | 3 ++-
 drivers/media/pci/ivtv/ivtv-driver.c | 2 +-
 drivers/media/platform/vivi.c        | 3 ++-
 drivers/mtd/ubi/build.c              | 2 +-
 drivers/net/wireless/airo.c          | 3 ++-
 drivers/scsi/aacraid/commctrl.c      | 3 ++-
 drivers/scsi/aacraid/commsup.c       | 3 ++-
 drivers/spi/spi.c                    | 2 +-
 drivers/staging/rtl8712/os_intfs.c   | 2 +-
 drivers/usb/atm/usbatm.c             | 5 +++--
 fs/lockd/svc.c                       | 2 +-
 fs/nfs/callback.c                    | 5 ++---
 fs/nfs/nfs4state.c                   | 2 +-
 kernel/rcutree.c                     | 2 +-
 net/sunrpc/svc.c                     | 2 +-
 19 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index fc803ecbbce4..b75c7db16559 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1340,7 +1340,7 @@ aoe_ktstart(struct ktstate *k)
 	struct task_struct *task;
 
 	init_completion(&k->rendez);
-	task = kthread_run(kthread, k, k->name);
+	task = kthread_run(kthread, k, "%s", k->name);
 	if (task == NULL || IS_ERR(task))
 		return -ENOMEM;
 	k->task = task;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 20dd52a2f92f..952dbfe22126 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4087,7 +4087,8 @@ skip_create_disk:
 start_service_thread:
 	sprintf(thd_name, "mtip_svc_thd_%02d", index);
 	dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
-						dd, dd->numa_node, thd_name);
+						dd, dd->numa_node, "%s",
+						thd_name);
 
 	if (IS_ERR(dd->mtip_svc_handler)) {
 		dev_err(&dd->pdev->dev, "service thread failed to start\n");
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 8bfd1bcf95ec..04608a6502d7 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -93,7 +93,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 	}
 	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
 
-	blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, name);
+	blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name);
 	if (IS_ERR(blkif->xenblkd)) {
 		err = PTR_ERR(blkif->xenblkd);
 		blkif->xenblkd = NULL;
diff --git a/drivers/hwmon/adt7470.c b/drivers/hwmon/adt7470.c
index b83bf4bb95eb..0f34bca9f5e5 100644
--- a/drivers/hwmon/adt7470.c
+++ b/drivers/hwmon/adt7470.c
@@ -1285,7 +1285,7 @@ static int adt7470_probe(struct i2c_client *client,
 	}
 
 	init_completion(&data->auto_update_stop);
-	data->auto_update = kthread_run(adt7470_update_thread, client,
+	data->auto_update = kthread_run(adt7470_update_thread, client, "%s",
 					dev_name(data->hwmon_dev));
 	if (IS_ERR(data->auto_update)) {
 		err = PTR_ERR(data->auto_update);
diff --git a/drivers/media/i2c/tvaudio.c b/drivers/media/i2c/tvaudio.c
index b72a59d3216a..e0634c8b7e0b 100644
--- a/drivers/media/i2c/tvaudio.c
+++ b/drivers/media/i2c/tvaudio.c
@@ -2020,7 +2020,8 @@ static int tvaudio_probe(struct i2c_client *client, const struct i2c_device_id *
 		/* start async thread */
 		chip->wt.function = chip_thread_wake;
 		chip->wt.data     = (unsigned long)chip;
-		chip->thread = kthread_run(chip_thread, chip, client->name);
+		chip->thread = kthread_run(chip_thread, chip, "%s",
+					   client->name);
 		if (IS_ERR(chip->thread)) {
 			v4l2_warn(sd, "failed to create kthread\n");
 			chip->thread = NULL;
diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c
index 07b8460953b6..b809bc868a9f 100644
--- a/drivers/media/pci/ivtv/ivtv-driver.c
+++ b/drivers/media/pci/ivtv/ivtv-driver.c
@@ -753,7 +753,7 @@ static int ivtv_init_struct1(struct ivtv *itv)
 
 	init_kthread_worker(&itv->irq_worker);
 	itv->irq_worker_task = kthread_run(kthread_worker_fn, &itv->irq_worker,
-					   itv->v4l2_dev.name);
+					   "%s", itv->v4l2_dev.name);
 	if (IS_ERR(itv->irq_worker_task)) {
 		IVTV_ERR("Could not create ivtv task\n");
 		return -1;
diff --git a/drivers/media/platform/vivi.c b/drivers/media/platform/vivi.c
index 85bc314382d3..1d3f11965196 100644
--- a/drivers/media/platform/vivi.c
+++ b/drivers/media/platform/vivi.c
@@ -768,7 +768,8 @@ static int vivi_start_generating(struct vivi_dev *dev)
 
 	dma_q->frame = 0;
 	dma_q->ini_jiffies = jiffies;
-	dma_q->kthread = kthread_run(vivi_thread, dev, dev->v4l2_dev.name);
+	dma_q->kthread = kthread_run(vivi_thread, dev, "%s",
+				     dev->v4l2_dev.name);
 
 	if (IS_ERR(dma_q->kthread)) {
 		v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n");
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index a56133585e92..0aaece9107c7 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -1005,7 +1005,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
 	if (err)
 		goto out_uif;
 
-	ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name);
+	ubi->bgt_thread = kthread_create(ubi_thread, ubi, "%s", ubi->bgt_name);
 	if (IS_ERR(ubi->bgt_thread)) {
 		err = PTR_ERR(ubi->bgt_thread);
 		ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name,
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 6125adb520a3..d0adbaf86186 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -1893,7 +1893,8 @@ static int airo_open(struct net_device *dev) {
 
 	if (ai->wifidev != dev) {
 		clear_bit(JOB_DIE, &ai->jobs);
-		ai->airo_thread_task = kthread_run(airo_thread, dev, dev->name);
+		ai->airo_thread_task = kthread_run(airo_thread, dev, "%s",
+						   dev->name);
 		if (IS_ERR(ai->airo_thread_task))
 			return (int)PTR_ERR(ai->airo_thread_task);
 
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 1ef041bc60c8..d85ac1a9d2c0 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -318,7 +318,8 @@ return_fib:
 			kthread_stop(dev->thread);
 			ssleep(1);
 			dev->aif_thread = 0;
-			dev->thread = kthread_run(aac_command_thread, dev, dev->name);
+			dev->thread = kthread_run(aac_command_thread, dev,
+						  "%s", dev->name);
 			ssleep(1);
 		}
 		if (f.wait) {
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 1be0776a80c4..cab190af6345 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -1336,7 +1336,8 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced)
 		if ((retval = pci_set_dma_mask(aac->pdev, DMA_BIT_MASK(32))))
 			goto out;
 	if (jafo) {
-		aac->thread = kthread_run(aac_command_thread, aac, aac->name);
+		aac->thread = kthread_run(aac_command_thread, aac, "%s",
+					  aac->name);
 		if (IS_ERR(aac->thread)) {
 			retval = PTR_ERR(aac->thread);
 			goto out;
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 32b7bb111eb6..085db8b2f2bc 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -601,7 +601,7 @@ static int spi_init_queue(struct spi_master *master)
 
 	init_kthread_worker(&master->kworker);
 	master->kworker_task = kthread_run(kthread_worker_fn,
-					   &master->kworker,
+					   &master->kworker, "%s",
 					   dev_name(&master->dev));
 	if (IS_ERR(master->kworker_task)) {
 		dev_err(&master->dev, "failed to create message pump task\n");
diff --git a/drivers/staging/rtl8712/os_intfs.c b/drivers/staging/rtl8712/os_intfs.c
index b65bf5e177a8..6e81ba0eaf1e 100644
--- a/drivers/staging/rtl8712/os_intfs.c
+++ b/drivers/staging/rtl8712/os_intfs.c
@@ -238,7 +238,7 @@ struct net_device *r8712_init_netdev(void)
 
 static u32 start_drv_threads(struct _adapter *padapter)
 {
-	padapter->cmdThread = kthread_run(r8712_cmd_thread, padapter,
+	padapter->cmdThread = kthread_run(r8712_cmd_thread, padapter, "%s",
 			      padapter->pnetdev->name);
 	if (IS_ERR(padapter->cmdThread) < 0)
 		return _FAIL;
diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c
index d3527dd8b90c..5e0d33a7da58 100644
--- a/drivers/usb/atm/usbatm.c
+++ b/drivers/usb/atm/usbatm.c
@@ -1020,7 +1020,7 @@ static int usbatm_heavy_init(struct usbatm_data *instance)
 {
 	struct task_struct *t;
 
-	t = kthread_create(usbatm_do_heavy_init, instance,
+	t = kthread_create(usbatm_do_heavy_init, instance, "%s",
 			instance->driver->driver_name);
 	if (IS_ERR(t)) {
 		usb_err(instance, "%s: failed to create kernel_thread (%ld)!\n",
@@ -1076,7 +1076,8 @@ int usbatm_usb_probe(struct usb_interface *intf, const struct usb_device_id *id,
 	/* public fields */
 
 	instance->driver = driver;
-	snprintf(instance->driver_name, sizeof(instance->driver_name), driver->driver_name);
+	strlcpy(instance->driver_name, driver->driver_name,
+		sizeof(instance->driver_name));
 
 	instance->usb_dev = usb_dev;
 	instance->usb_intf = intf;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cff089a412c7..da6a43d19aa3 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 	struct svc_rqst *rqstp;
 	int (*callback_svc)(void *vrqstp);
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
-	char svc_name[12];
 	int ret;
 
 	nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 
 	svc_sock_update_bufs(serv);
 
-	sprintf(svc_name, "nfsv4.%u-svc", minorversion);
 	cb_info->serv = serv;
 	cb_info->rqst = rqstp;
-	cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
+	cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+				    "nfsv4.%u-svc", minorversion);
 	if (IS_ERR(cb_info->task)) {
 		ret = PTR_ERR(cb_info->task);
 		svc_exit_thread(cb_info->rqst);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ff10b4aa534c..55418811a55a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1194,7 +1194,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 	snprintf(buf, sizeof(buf), "%s-manager",
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 	rcu_read_unlock();
-	task = kthread_run(nfs4_run_state_manager, clp, buf);
+	task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
 	if (IS_ERR(task)) {
 		printk(KERN_ERR "%s: kthread_run: %ld\n",
 			__func__, PTR_ERR(task));
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index cf3adc6fe001..e08abb9461ac 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -3026,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void)
 	struct task_struct *t;
 
 	for_each_rcu_flavor(rsp) {
-		t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
+		t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
 		BUG_ON(IS_ERR(t));
 		rnp = rcu_get_root(rsp);
 		raw_spin_lock_irqsave(&rnp->lock, flags);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 89a588b4478b..b974571126fe 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -740,7 +740,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 
 		__module_get(serv->sv_module);
 		task = kthread_create_on_node(serv->sv_function, rqstp,
-					      node, serv->sv_name);
+					      node, "%s", serv->sv_name);
 		if (IS_ERR(task)) {
 			error = PTR_ERR(task);
 			module_put(serv->sv_module);
-- 
cgit v1.2.3-58-ga151


From c7ef972c440fc9f1eda28b450cd30ad15c4d60cf Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Wed, 3 Jul 2013 15:08:05 -0700
Subject: nilfs2: implement calculation of free inodes count

Currently, NILFS2 returns 0 as free inodes count (f_ffree) and current
used inodes count as total file nodes in file system (f_files):

df -i
Filesystem      Inodes  IUsed   IFree IUse% Mounted on
/dev/loop0           2      2       0  100% /mnt/nilfs2

This patch implements real calculation of free inodes count.  First of
all, it is calculated total file nodes in file system as
(desc_blocks_count * groups_per_desc_block * entries_per_group).  Then, it
is calculated free inodes count as difference the total file nodes and
used inodes count.  As a result, we have such output for NILFS2:

df -i
Filesystem       Inodes   IUsed    IFree IUse% Mounted on
/dev/loop0      4194304 2114701  2079603   51% /mnt/nilfs2

Reported-by: Clemens Eisserer <linuxhippy@gmail.com>
Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Joern Engel <joern@logfs.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/alloc.h |  2 ++
 fs/nilfs2/ifile.c | 22 +++++++++++++++++++
 fs/nilfs2/ifile.h |  2 ++
 fs/nilfs2/super.c | 25 ++++++++++++++++++++--
 5 files changed, 112 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -397,6 +397,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 		     max - curr + 1);
 }
 
+/**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+					    unsigned long *desc_blocks)
+{
+	unsigned long blknum;
+	int ret;
+
+	ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+	if (likely(!ret))
+		*desc_blocks = DIV_ROUND_UP(
+			blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ *					MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+						    unsigned long desc_blocks)
+{
+	return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+			nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ *					described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+	unsigned long desc_blocks = 0;
+	u64 entries_per_desc_block, nmax;
+	int err;
+
+	err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+	if (unlikely(err))
+		return err;
+
+	entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+				nilfs_palloc_groups_per_desc_block(inode);
+	nmax = entries_per_desc_block * desc_blocks;
+
+	if (nused == nmax &&
+			nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+		nmax += entries_per_desc_block;
+
+	if (nused > nmax)
+		return -ERANGE;
+
+	*nmaxp = nmax;
+	return 0;
+}
+
 /**
  * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
  * @inode: inode of metadata file using this allocator
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
 void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
 /**
  * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..d788a5928351 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -159,6 +159,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 	return err;
 }
 
+/**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+				    u64 *nmaxinodes, u64 *nfreeinodes)
+{
+	u64 nused;
+	int err;
+
+	*nmaxinodes = 0;
+	*nfreeinodes = 0;
+
+	nused = atomic_read(&NILFS_I(ifile)->i_root->inodes_count);
+	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+	if (likely(!err))
+		*nfreeinodes = *nmaxinodes - nused;
+	return err;
+}
+
 /**
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 		     size_t inode_size, struct nilfs_inode *raw_inode,
 		     struct inode **inodep);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..7d257e78fbad 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -609,6 +609,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned long overhead;
 	unsigned long nrsvblocks;
 	sector_t nfreeblocks;
+	u64 nmaxinodes, nfreeinodes;
 	int err;
 
 	/*
@@ -633,14 +634,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (unlikely(err))
 		return err;
 
+	err = nilfs_ifile_count_free_inodes(root->ifile,
+					    &nmaxinodes, &nfreeinodes);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+			"NILFS warning: fail to count free inodes: err %d.\n",
+			err);
+		if (err == -ERANGE) {
+			/*
+			 * If nilfs_palloc_count_max_entries() returns
+			 * -ERANGE error code then we simply treat
+			 * curent inodes count as maximum possible and
+			 * zero as free inodes value.
+			 */
+			nmaxinodes = atomic_read(&root->inodes_count);
+			nfreeinodes = 0;
+			err = 0;
+		} else
+			return err;
+	}
+
 	buf->f_type = NILFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = blocks - overhead;
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&root->inodes_count);
-	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+	buf->f_files = nmaxinodes;
+	buf->f_ffree = nfreeinodes;
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-- 
cgit v1.2.3-58-ga151


From e5f7f84843154db8b6ef5b2ac5e286f72212f54e Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Wed, 3 Jul 2013 15:08:06 -0700
Subject: ] nilfs2: use atomic64_t type for inodes_count and blocks_count
 fields in nilfs_root struct

The cp_inodes_count and cp_blocks_count are represented as __le64 type in
on-disk structure (struct nilfs_checkpoint).  But analogous fields in
in-core structure (struct nilfs_root) are represented by atomic_t type.

This patch replaces atomic_t on atomic64_t type in representation of
inodes_count and blocks_count fields in struct nilfs_root.

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Joern Engel <joern@logfs.org>
Cc: Clemens Eisserer <linuxhippy@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/ifile.c     | 2 +-
 fs/nilfs2/inode.c     | 8 ++++----
 fs/nilfs2/segment.c   | 4 ++--
 fs/nilfs2/super.c     | 8 +++++---
 fs/nilfs2/the_nilfs.c | 4 ++--
 fs/nilfs2/the_nilfs.h | 4 ++--
 6 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d788a5928351..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -174,7 +174,7 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile,
 	*nmaxinodes = 0;
 	*nfreeinodes = 0;
 
-	nused = atomic_read(&NILFS_I(ifile)->i_root->inodes_count);
+	nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
 	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
 	if (likely(!err))
 		*nfreeinodes = *nmaxinodes - nused;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bccfec8343c5..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 
 	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_add(n, &root->blocks_count);
+		atomic64_add(n, &root->blocks_count);
 }
 
 void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 
 	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_sub(n, &root->blocks_count);
+		atomic64_sub(n, &root->blocks_count);
 }
 
 /**
@@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&root->inodes_count);
+	atomic64_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
 
 	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
 	if (!ret)
-		atomic_dec(&ii->i_root->inodes_count);
+		atomic64_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	raw_cp->cp_snapshot_list.ssl_next = 0;
 	raw_cp->cp_snapshot_list.ssl_prev = 0;
 	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+		cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
 	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+		cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
 	raw_cp->cp_nblk_inc =
 		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7d257e78fbad..1427de5ebf4d 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 	if (err)
 		goto failed_bh;
 
-	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+	atomic64_set(&root->inodes_count,
+			le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count,
+			le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
@@ -647,7 +649,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 			 * curent inodes count as maximum possible and
 			 * zero as free inodes value.
 			 */
-			nmaxinodes = atomic_read(&root->inodes_count);
+			nmaxinodes = atomic64_read(&root->inodes_count);
 			nfreeinodes = 0;
 			err = 0;
 		} else
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 	new->ifile = NULL;
 	new->nilfs = nilfs;
 	atomic_set(&new->count, 1);
-	atomic_set(&new->inodes_count, 0);
-	atomic_set(&new->blocks_count, 0);
+	atomic64_set(&new->inodes_count, 0);
+	atomic64_set(&new->blocks_count, 0);
 
 	rb_link_node(&new->rb_node, parent, p);
 	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
 	struct the_nilfs *nilfs;
 	struct inode *ifile;
 
-	atomic_t inodes_count;
-	atomic_t blocks_count;
+	atomic64_t inodes_count;
+	atomic64_t blocks_count;
 };
 
 /* Special checkpoint number */
-- 
cgit v1.2.3-58-ga151


From e68e96d2a7b2391cc1f70e207ec9cbe9d092adc9 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Wed, 3 Jul 2013 15:08:08 -0700
Subject: fs/fat: use fat_msg() to replace printk() in __fat_fs_error()

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/misc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 		va_start(args, fmt);
 		vaf.fmt = fmt;
 		vaf.va = &args;
-		printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
+		fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
 		va_end(args);
 	}
 
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 		panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
 	else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
 		sb->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
-				"set read-only\n", sb->s_id);
+		fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
 	}
 }
 EXPORT_SYMBOL_GPL(__fat_fs_error);
-- 
cgit v1.2.3-58-ga151


From 77d55918029ef4626bfd6ba716728dcc7d919de3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:14 -0700
Subject: signals: eventpoll: do not use sigprocmask()

sigprocmask() should die. None of the current callers actually
need this strange interface.

Change fs/eventpoll.c to use set_current_blocked(). This also
means we should not worry about SIGKILL/SIGSTOP.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Jason Baron <jbaron@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index deecc7294a67..0f0f73624a88 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1975,8 +1975,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			return -EINVAL;
 		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1993,7 +1993,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			       sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return error;
@@ -2020,8 +2020,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
 			return -EFAULT;
 		sigset_from_compat(&ksigmask, &csigmask);
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	err = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -2038,7 +2038,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			       sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return err;
-- 
cgit v1.2.3-58-ga151


From e7fd1549aeb83e34ee0955cdf5dee5d4088508f3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:16 -0700
Subject: coredump: format_corename() can leak cn->corename

do_coredump() assumes that format_corename() can only fail if
expand_corename() fails and frees cn->corename.  This is not true, for
example cn_print_exe_file() can fail and in this case nobody frees
cn->corename.

Change do_coredump() to always do kfree(cn->corename) after it calls
format_corename() (NULL is fine), change expand_corename() to do nothing
if kmalloc() fails.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Colin Walters <walters@verbum.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Lucas De Marchi <lucas.de.marchi@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index dafafbafa731..11bc368e0017 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -58,16 +58,14 @@ static atomic_t call_count = ATOMIC_INIT(1);
 
 static int expand_corename(struct core_name *cn)
 {
-	char *old_corename = cn->corename;
+	int size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
 
-	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
-	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
-
-	if (!cn->corename) {
-		kfree(old_corename);
+	if (!corename)
 		return -ENOMEM;
-	}
 
+	cn->size = size;
+	cn->corename = corename;
 	return 0;
 }
 
@@ -157,10 +155,9 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 	int pid_in_pattern = 0;
 	int err = 0;
 
+	cn->used = 0;
 	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
 	cn->corename = kmalloc(cn->size, GFP_KERNEL);
-	cn->used = 0;
-
 	if (!cn->corename)
 		return -ENOMEM;
 
@@ -549,7 +546,7 @@ void do_coredump(siginfo_t *siginfo)
 		if (ispipe < 0) {
 			printk(KERN_WARNING "format_corename failed\n");
 			printk(KERN_WARNING "Aborting core\n");
-			goto fail_corename;
+			goto fail_unlock;
 		}
 
 		if (cprm.limit == 1) {
@@ -669,7 +666,6 @@ fail_dropcount:
 		atomic_dec(&core_dump_count);
 fail_unlock:
 	kfree(cn.corename);
-fail_corename:
 	coredump_finish(mm, core_dumped);
 	revert_creds(old_cred);
 fail_creds:
-- 
cgit v1.2.3-58-ga151


From bc03c691aa86948af4e272ebdcdd4203018210f3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:17 -0700
Subject: coredump: introduce cn_vprintf()

Turn cn_printf(...) into cn_vprintf(va_list args), reintroduce
cn_printf() as a trivial wrapper.

This simplifies the next change and cn_vprintf() will have more
callers.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Colin Walters <walters@verbum.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Lucas De Marchi <lucas.de.marchi@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index 11bc368e0017..c10a43aae220 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -69,17 +69,13 @@ static int expand_corename(struct core_name *cn)
 	return 0;
 }
 
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
 {
 	char *cur;
 	int need;
 	int ret;
-	va_list arg;
 
-	va_start(arg, fmt);
 	need = vsnprintf(NULL, 0, fmt, arg);
-	va_end(arg);
-
 	if (likely(need < cn->size - cn->used - 1))
 		goto out_printf;
 
@@ -89,9 +85,7 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...)
 
 out_printf:
 	cur = cn->corename + cn->used;
-	va_start(arg, fmt);
 	vsnprintf(cur, need + 1, fmt, arg);
-	va_end(arg);
 	cn->used += need;
 	return 0;
 
@@ -99,6 +93,18 @@ expand_fail:
 	return ret;
 }
 
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+	va_list arg;
+	int ret;
+
+	va_start(arg, fmt);
+	ret = cn_vprintf(cn, fmt, arg);
+	va_end(arg);
+
+	return ret;
+}
+
 static void cn_escape(char *str)
 {
 	for (; *str; str++)
-- 
cgit v1.2.3-58-ga151


From 5fe9d8ca21cc1517258fe448639392d5d542eec6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:19 -0700
Subject: coredump: cn_vprintf() has no reason to call vsnprintf() twice

cn_vprintf() looks really overcomplicated and sub-optimal.  We do not need
vsnprintf(NULL) to calculate the size we need, we can simply try to print
into the current buffer and expand/retry only if necessary.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Colin Walters <walters@verbum.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Lucas De Marchi <lucas.de.marchi@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index c10a43aae220..2b1d1f54e630 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -71,26 +71,20 @@ static int expand_corename(struct core_name *cn)
 
 static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
 {
-	char *cur;
-	int need;
-	int ret;
-
-	need = vsnprintf(NULL, 0, fmt, arg);
-	if (likely(need < cn->size - cn->used - 1))
-		goto out_printf;
-
-	ret = expand_corename(cn);
-	if (ret)
-		goto expand_fail;
+	int free, need;
+
+again:
+	free = cn->size - cn->used;
+	need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+	if (need < free) {
+		cn->used += need;
+		return 0;
+	}
 
-out_printf:
-	cur = cn->corename + cn->used;
-	vsnprintf(cur, need + 1, fmt, arg);
-	cn->used += need;
-	return 0;
+	if (!expand_corename(cn))
+		goto again;
 
-expand_fail:
-	return ret;
+	return -ENOMEM;
 }
 
 static int cn_printf(struct core_name *cn, const char *fmt, ...)
-- 
cgit v1.2.3-58-ga151


From 923bed030ff6e20b5176e10da151fade83097891 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:20 -0700
Subject: coredump: kill cn_escape(), introduce cn_esc_printf()

The usage of cn_escape() looks really annoying, imho this sequence needs a
wrapper.  And it is buggy.  If cn_printf() does expand_corename()
cn_escape() writes to the freed memory.

Introduce cn_esc_printf() which hopefully does this all right.  It records
the index before cn_vprintf(), not "char *" which is no longer valid (in
general) after krealloc().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Colin Walters <walters@verbum.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Lucas De Marchi <lucas.de.marchi@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index 2b1d1f54e630..90d7cee54347 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -99,11 +99,21 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...)
 	return ret;
 }
 
-static void cn_escape(char *str)
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
 {
-	for (; *str; str++)
-		if (*str == '/')
-			*str = '!';
+	int cur = cn->used;
+	va_list arg;
+	int ret;
+
+	va_start(arg, fmt);
+	ret = cn_vprintf(cn, fmt, arg);
+	va_end(arg);
+
+	for (; cur < cn->used; ++cur) {
+		if (cn->corename[cur] == '/')
+			cn->corename[cur] = '!';
+	}
+	return ret;
 }
 
 static int cn_print_exe_file(struct core_name *cn)
@@ -113,12 +123,8 @@ static int cn_print_exe_file(struct core_name *cn)
 	int ret;
 
 	exe_file = get_mm_exe_file(current->mm);
-	if (!exe_file) {
-		char *commstart = cn->corename + cn->used;
-		ret = cn_printf(cn, "%s (path unknown)", current->comm);
-		cn_escape(commstart);
-		return ret;
-	}
+	if (!exe_file)
+		return cn_esc_printf(cn, "%s (path unknown)", current->comm);
 
 	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
 	if (!pathbuf) {
@@ -132,9 +138,7 @@ static int cn_print_exe_file(struct core_name *cn)
 		goto free_buf;
 	}
 
-	cn_escape(path);
-
-	ret = cn_printf(cn, "%s", path);
+	ret = cn_esc_printf(cn, "%s", path);
 
 free_buf:
 	kfree(pathbuf);
@@ -207,22 +211,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 				break;
 			}
 			/* hostname */
-			case 'h': {
-				char *namestart = cn->corename + cn->used;
+			case 'h':
 				down_read(&uts_sem);
-				err = cn_printf(cn, "%s",
+				err = cn_esc_printf(cn, "%s",
 					      utsname()->nodename);
 				up_read(&uts_sem);
-				cn_escape(namestart);
 				break;
-			}
 			/* executable */
-			case 'e': {
-				char *commstart = cn->corename + cn->used;
-				err = cn_printf(cn, "%s", current->comm);
-				cn_escape(commstart);
+			case 'e':
+				err = cn_esc_printf(cn, "%s", current->comm);
 				break;
-			}
 			case 'E':
 				err = cn_print_exe_file(cn);
 				break;
-- 
cgit v1.2.3-58-ga151


From 3ceadcf6d489650ade673b7197c11c521aecb038 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:22 -0700
Subject: coredump: kill call_count, add core_name_size

Imho, "atomic_t call_count" is ugly and should die.  It buys nothing and
in fact it can grow more than necessary, expand doesn't check if it was
already incremented by another task.

Kill it, and introduce "static int core_name_size" updated by
expand_corename().  This is obviously racy too but harmless, and
core_name_size never grows for no reason.

We do not bother to to calculate the "right" new size, we simply do
kmalloc(size_we_need) and use ksize() to rely on kmalloc_index's decision.

Finally change format_corename() to use expand_corename(), krealloc(NULL)
is fine.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Colin Walters <walters@verbum.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Lucas De Marchi <lucas.de.marchi@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index 90d7cee54347..56a9ab963a40 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,26 +45,28 @@
 #include <trace/events/sched.h>
 
 int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
 
 struct core_name {
 	char *corename;
 	int used, size;
 };
-static atomic_t call_count = ATOMIC_INIT(1);
 
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
-static int expand_corename(struct core_name *cn)
+static int expand_corename(struct core_name *cn, int size)
 {
-	int size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
 	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
 
 	if (!corename)
 		return -ENOMEM;
 
-	cn->size = size;
+	if (size > core_name_size) /* racy but harmless */
+		core_name_size = size;
+
+	cn->size = ksize(corename);
 	cn->corename = corename;
 	return 0;
 }
@@ -81,7 +83,7 @@ again:
 		return 0;
 	}
 
-	if (!expand_corename(cn))
+	if (!expand_corename(cn, cn->size + need - free + 1))
 		goto again;
 
 	return -ENOMEM;
@@ -160,9 +162,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 	int err = 0;
 
 	cn->used = 0;
-	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
-	cn->corename = kmalloc(cn->size, GFP_KERNEL);
-	if (!cn->corename)
+	cn->corename = NULL;
+	if (expand_corename(cn, core_name_size))
 		return -ENOMEM;
 
 	/* Repeat as long as we have more pattern to process and more output
-- 
cgit v1.2.3-58-ga151


From 888ffc5923e4343a78575918ab781e85fa22d244 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:23 -0700
Subject: coredump: '% at the end' shouldn't bypass core_uses_pid logic

"goto end" should not bypass the "Backward compatibility with
core_uses_pid" code, move this label up.

While at it,

	- It is ugly to copy '|' into cn->corename and then inc
	  the pointer for argv_split().

	  Change format_corename() to increment pat_ptr instead.

	- Remove the dead "if (*pat_ptr == 0)" in format_corename(),
	  we already checked it is not zero.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Colin Walters <walters@verbum.org>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Lucas De Marchi <lucas.de.marchi@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index 56a9ab963a40..72f816d6cad9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -165,13 +165,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 	cn->corename = NULL;
 	if (expand_corename(cn, core_name_size))
 		return -ENOMEM;
+	cn->corename[0] = '\0';
+
+	if (ispipe)
+		++pat_ptr;
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
 		if (*pat_ptr != '%') {
-			if (*pat_ptr == 0)
-				goto out;
 			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
 			switch (*++pat_ptr) {
@@ -240,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 			return err;
 	}
 
+out:
 	/* Backward compatibility with core_uses_pid:
 	 *
 	 * If core_pattern does not include a %p (as is the default)
@@ -250,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 		if (err)
 			return err;
 	}
-out:
 	return ispipe;
 }
 
@@ -580,7 +582,7 @@ void do_coredump(siginfo_t *siginfo)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+		helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
@@ -597,7 +599,7 @@ void do_coredump(siginfo_t *siginfo)
 
 		argv_free(helper_argv);
 		if (retval) {
-			printk(KERN_INFO "Core dump to %s pipe failed\n",
+			printk(KERN_INFO "Core dump to |%s pipe failed\n",
 			       cn.corename);
 			goto close_fail;
 		}
-- 
cgit v1.2.3-58-ga151


From 3f4185483832ccf3d2977923db576fa689c2abce Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:25 -0700
Subject: fs/exec.c:de_thread(): use change_pid() rather than
 detach_pid/attach_pid

de_thread() can use change_pid() instead of detach + attach.  This looks
better and this ensures that, say, next_thread() can never see a task with
->pid == NULL.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Sergey Dyasly <dserrg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 03b907cfd765..7619dddd5622 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -947,9 +947,8 @@ static int de_thread(struct task_struct *tsk)
 		 * Note: The old leader also uses this pid until release_task
 		 *       is called.  Odd but simple and correct.
 		 */
-		detach_pid(tsk, PIDTYPE_PID);
 		tsk->pid = leader->pid;
-		attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
+		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
 		transfer_pid(leader, tsk, PIDTYPE_PGID);
 		transfer_pid(leader, tsk, PIDTYPE_SID);
 
-- 
cgit v1.2.3-58-ga151


From 1d98a5fa11e9a66a7cf7b03f73cab60184781065 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:27 -0700
Subject: fs/proc/uptime.c:uptime_proc_show(): use get_monotonic_boottime()

Change uptime_proc_show() to use get_monotonic_boottime() instead of
do_posix_clock_monotonic_gettime() + monotonic_to_bootbased().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Tomas Janousek <tjanouse@redhat.com>
Cc: Tomas Smetana <tsmetana@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/uptime.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 	for_each_possible_cpu(i)
 		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
-	do_posix_clock_monotonic_gettime(&uptime);
-	monotonic_to_bootbased(&uptime);
+	get_monotonic_boottime(&uptime);
 	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
 	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	idle.tv_nsec = rem;
-- 
cgit v1.2.3-58-ga151


From 30bc30df102b2d0c003d93477e04b97e6c528573 Mon Sep 17 00:00:00 2001
From: Zhao Hongjiang <zhaohongjiang@huawei.com>
Date: Wed, 3 Jul 2013 15:08:28 -0700
Subject: fs/proc/kcore.c: using strlcpy() instead of strncpy()

For NUL terminated string, set '\0' at the end.

Signed-off-by: Zhao Hongjiang <zhaohongjiang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	prpsinfo.pr_zomb	= 0;
 
 	strcpy(prpsinfo.pr_fname, "vmlinux");
-	strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+	strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
 
 	nhdr->p_filesz	+= notesize(&notes[1]);
 	bufp = storenote(&notes[1], bufp);
-- 
cgit v1.2.3-58-ga151


From bd9d43f47d6944019918a801398c587ee86c4b52 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:34 -0700
Subject: fs/exec.c: do_execve_common(): use current_user()

Trivial cleanup.  do_execve_common() can use current_user() and avoid the
unnecessary "struct cred *cred" var.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 7619dddd5622..396aec2dae90 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1464,7 +1464,6 @@ static int do_execve_common(const char *filename,
 	struct files_struct *displaced;
 	bool clear_in_exec;
 	int retval;
-	const struct cred *cred = current_cred();
 
 	/*
 	 * We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1473,7 +1472,7 @@ static int do_execve_common(const char *filename,
 	 * whether NPROC limit is still exceeded.
 	 */
 	if ((current->flags & PF_NPROC_EXCEEDED) &&
-	    atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
 		retval = -EAGAIN;
 		goto out_ret;
 	}
-- 
cgit v1.2.3-58-ga151


From 266b7a021f7dcc4d4531961a47f4ef74c3c4ab6b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:35 -0700
Subject: fs/exec.c:de_thread: mt-exec should update ->real_start_time

924b42d5 ("Use boot based time for process start time and boot time in
/proc") updated copy_process/do_task_stat but forgot about de_thread().
This breaks "ps axOT" if a sub-thread execs.

Note: I think that task->start_time should die.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Tomas Janousek <tjanouse@redhat.com>
Cc: Tomas Smetana <tsmetana@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 396aec2dae90..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
 		 * also take its birthdate (always earlier than our own).
 		 */
 		tsk->start_time = leader->start_time;
+		tsk->real_start_time = leader->real_start_time;
 
 		BUG_ON(!same_thread_group(leader, tsk));
 		BUG_ON(has_group_leader_pid(tsk));
-- 
cgit v1.2.3-58-ga151


From 4b30f07e74d2df673925019ad58bc59a719df3bb Mon Sep 17 00:00:00 2001
From: Tang Chen <tangchen@cn.fujitsu.com>
Date: Wed, 3 Jul 2013 15:09:16 -0700
Subject: aio: fix wrong comment in aio_complete()

ctx->ctx_lock should be ctx->completion_lock.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index a8ecc8313fb0..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -625,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
-	 * ctx->ctx_lock to prevent other code from messing with the tail
+	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-- 
cgit v1.2.3-58-ga151