From 35e481761cdc688dbee0ef552a13f49af8eba6cc Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 19 May 2016 17:08:59 -0700
Subject: fsnotify: avoid spurious EMFILE errors from inotify_init()

An inotify instance is destroyed when all references to it are dropped.
That not only means that the corresponding file descriptor needs to be
closed, but also that all corresponding instance marks are freed (as each
mark holds a reference to the inotify instance).  However, marks are freed
only after the SRCU period ends, which can take some time.  Thus if a user
rapidly creates and frees inotify instances, the number of existing
instances can exceed the max_user_instances limit even though, from the
user's point of view, at most one instance exists at any time.
inotify_init() then returns an EMFILE error that is hard to justify from
the user's point of view.  This problem is exposed by the LTP inotify06
testcase on some machines.

We fix the problem by making sure all group marks are properly freed while
destroying the inotify instance.  We wait for the SRCU period to end in
that path anyway, since we have to make sure no event is being added to
the instance while we are tearing it down.  So it takes only some plumbing
to allow marks to be destroyed in that path as well, instead of from a
dedicated work item.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jan Kara
Reported-by: Xiaoguang Wang
Tested-by: Xiaoguang Wang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/fsnotify_backend.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1259e53d9296..29f917517299 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
 /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
 extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_unmount_inodes(struct super_block *sb);
--
cgit v1.2.3-58-ga151


From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani
Date: Thu, 19 May 2016 17:09:02 -0700
Subject: time: add missing implementation for timespec64_add_safe()

timespec64_add_safe() has been defined in time64.h for 64-bit systems,
but 32-bit systems only have an extern function prototype.  Provide a
definition for the function.

The function will be necessary as part of the y2038 changes: struct
timespec is not y2038 safe, and all references to timespec will be
replaced by struct timespec64.  The function is meant as a replacement
for timespec_add_safe(), and the implementation is similar.
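To make the saturation rule concrete, here is a minimal stand-alone sketch
(not part of the patch): the ts64 type and helper names below are invented
for illustration, and only the "saturate to TIME64_MAX on overflow"
behaviour comes from the patch itself.

/*
 * User-space sketch of the timespec64_add_safe() overflow rule.
 * Assumes both inputs are normalized and non-negative, as the kernel
 * helper does.  The seconds addition is done in unsigned arithmetic so
 * that wrap-around is well defined here; the kernel relies on its own
 * wrapping semantics for signed arithmetic instead.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000L
#define TIME64_MAX	INT64_MAX

struct ts64 {
	int64_t	tv_sec;
	long	tv_nsec;
};

static struct ts64 ts64_add_safe(struct ts64 lhs, struct ts64 rhs)
{
	struct ts64 res;
	uint64_t sec = (uint64_t)lhs.tv_sec + (uint64_t)rhs.tv_sec;
	long nsec = lhs.tv_nsec + rhs.tv_nsec;

	/* Normalize: carry a full second from the nanosecond field. */
	if (nsec >= NSEC_PER_SEC) {
		nsec -= NSEC_PER_SEC;
		sec++;
	}
	res.tv_sec = (int64_t)sec;
	res.tv_nsec = nsec;

	/* A wrapped sum is smaller than one of its operands: saturate. */
	if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) {
		res.tv_sec = TIME64_MAX;
		res.tv_nsec = 0;
	}
	return res;
}

int main(void)
{
	struct ts64 a = { INT64_MAX - 1, 900000000L };
	struct ts64 b = { 3, 300000000L };
	struct ts64 sum = ts64_add_safe(a, b);

	/* Prints sec=9223372036854775807 nsec=0: the sum saturated. */
	printf("sec=%lld nsec=%ld\n", (long long)sum.tv_sec, sum.tv_nsec);
	return 0;
}

The same pattern is what the 32-bit definition added by this patch
implements with set_normalized_timespec64().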
Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time64.h | 4 +--- kernel/time/time.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/time64.h b/include/linux/time64.h index 367d5af899e8..1778937221bf 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n /* * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME_T_MAX if the returned value would be - * smaller then either of the arguments. + * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { diff --git a/kernel/time/time.c b/kernel/time/time.c index a4064b612066..cb1f83eb5599 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +#if __BITS_PER_LONG != 64 + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +#endif -- cgit v1.2.3-58-ga151 From 766b9f928bd5b9b185d986d40355d1f143484136 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:05 -0700 Subject: fs: poll/select/recvmmsg: use timespec64 for timeout events struct timespec is not y2038 safe. Even though timespec might be sufficient to represent timeouts, use struct timespec64 here as the plan is to get rid of all timespec reference in the kernel. The patch transitions the common functions: poll_select_set_timeout() and select_estimate_accuracy() to use timespec64. And, all the syscalls that use these functions are transitioned in the same patch. The restart block parameters for poll uses monotonic time. Use timespec64 here as well to assign timeout value. This parameter in the restart block need not change because this only holds the monotonic timestamp at which timeout should occur. And, unsigned long data type should be big enough for this timestamp. The system call interfaces will be handled in a separate series. Compat interfaces need not change as timespec64 is an alias to struct timespec on a 64 bit system. Link: http://lkml.kernel.org/r/1461947989-21926-3-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Acked-by: David S. 
Miller Cc: Alexander Viro Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 12 +++++----- fs/select.c | 67 +++++++++++++++++++++++++++++----------------------- include/linux/poll.h | 11 +++++---- net/socket.c | 8 ++++--- 4 files changed, 54 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a74a2a52e0f..10db91218933 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } -static inline struct timespec ep_set_mstimeout(long ms) +static inline struct timespec64 ep_set_mstimeout(long ms) { - struct timespec now, ts = { + struct timespec64 now, ts = { .tv_sec = ms / MSEC_PER_SEC, .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), }; - ktime_get_ts(&now); - return timespec_add_safe(now, ts); + ktime_get_ts64(&now); + return timespec64_add_safe(now, ts); } /** @@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ktime_t expires, *to = NULL; if (timeout > 0) { - struct timespec end_time = ep_set_mstimeout(timeout); + struct timespec64 end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(&end_time); to = &expires; - *to = timespec_to_ktime(end_time); + *to = timespec64_to_ktime(end_time); } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the diff --git a/fs/select.c b/fs/select.c index 869293988c2a..8ed9da50896a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -47,7 +47,7 @@ #define MAX_SLACK (100 * NSEC_PER_MSEC) -static long __estimate_accuracy(struct timespec *tv) +static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; @@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -u64 select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; - struct timespec now; + struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. @@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv) if (rt_task(current)) return 0; - ktime_get_ts(&now); - now = timespec_sub(*tv, now); + ktime_get_ts64(&now); + now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; @@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value - * @to: pointer to timespec variable for the final timeout + * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * @@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout); * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. 
*/ -int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { - struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { - ktime_get_ts(to); - *to = timespec_add_safe(*to, ts); + ktime_get_ts64(to); + *to = timespec64_add_safe(*to, ts); } return 0; } -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, +static int poll_select_copy_remaining(struct timespec64 *end_time, + void __user *p, int timeval, int ret) { + struct timespec64 rts64; struct timespec rts; struct timeval rtv; @@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&rts); - rts = timespec_sub(*end_time, rts); - if (rts.tv_sec < 0) - rts.tv_sec = rts.tv_nsec = 0; + ktime_get_ts64(&rts64); + rts64 = timespec64_sub(*end_time, rts64); + if (rts64.tv_sec < 0) + rts64.tv_sec = rts64.tv_nsec = 0; + + rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts.tv_sec; - rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts64.tv_sec; + rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; @@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time) + fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -622,7 +626,7 @@ out_nofds: SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct timeval tv; int ret; @@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 ts64, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; + ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) return -EINVAL; } @@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, - struct timespec *end_time) + struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, sizeof(struct pollfd)) int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec *end_time) + struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; @@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec *to = NULL, end_time; + struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { @@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { @@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 end_time, *to = NULL; int ret; if (tsp) { diff --git a/include/linux/poll.h b/include/linux/poll.h index 9fb4f40d9a26..37b057b63b46 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern u64 select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec64 *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) @@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset) #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) -extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time); +extern int 
do_select(int n, fd_set_bits *fds, struct timespec64 *end_time); extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, - struct timespec *end_time); + struct timespec64 *end_time); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time); + fd_set __user *exp, struct timespec64 *end_time); -extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec); +extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec, + long nsec); #endif /* _LINUX_POLL_H */ diff --git a/net/socket.c b/net/socket.c index e7793f5601ae..a1bd16106625 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; - struct timespec end_time; + struct timespec64 end_time; + struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, @@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, flags |= MSG_DONTWAIT; if (timeout) { - ktime_get_ts(timeout); - *timeout = timespec_sub(end_time, *timeout); + ktime_get_ts64(&timeout64); + *timeout = timespec64_to_timespec( + timespec64_sub(end_time, timeout64)); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; -- cgit v1.2.3-58-ga151 From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:08 -0700 Subject: time: remove timespec_add_safe() All references to timespec_add_safe() now use timespec64_add_safe(). The plan is to replace struct timespec references with struct timespec64 throughout the kernel as timespec is not y2038 safe. Drop timespec_add_safe() and use timespec64_add_safe() for all architectures. Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time64.h | 15 +++++++-------- kernel/time/time.c | 4 ---- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/time64.h b/include/linux/time64.h index 1778937221bf..7e5d2fa9ac46 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec * # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add_safe timespec_add_safe # define timespec64_add timespec_add # define timespec64_sub timespec_sub # define timespec64_valid timespec_valid @@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); -/* - * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME64_MAX in case of overflow. 
- */ -extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, - const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { @@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) #endif +/* + * timespec64_add_safe assumes both values are positive and checks for + * overflow. It will return TIME64_MAX in case of overflow. + */ +extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs); + #endif /* _LINUX_TIME64_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index cb1f83eb5599..667b9335f5d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } -#if __BITS_PER_LONG != 64 - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). @@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } - -#endif -- cgit v1.2.3-58-ga151 From b1e4d9d82df8ab9097f80aa208c40eab6fc29858 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:20 -0700 Subject: debugobjects: make fixup functions return bool instead of int I am going to introduce debugobjects infrastructure to USB subsystem. But before this, I found the code of debugobjects could be improved. This patchset will make fixup functions return bool type instead of int. Because fixup only need report success or no. boolean is the 'real' type. This patch (of 7): The object debugging infrastructure core provides some fixup callbacks for the subsystem who use it. These callbacks are called from the debug code whenever a problem in debug_object_init is detected. And debugobjects core suppose them returns 1 when the fixup was successful, otherwise 0. So the return type is boolean. A bad thing is that debug_object_fixup use the return value for arithmetic operation. It confused me that what is the reall return type. Reading over the whole code, I found some place do use the return value incorrectly(see next patch). So why use bool type instead? Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debugobjects.h | 15 ++++++++------- lib/debugobjects.c | 43 +++++++++++++++++++++---------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 98ffcbd4888e..a899f10c9365 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -39,7 +39,8 @@ struct debug_obj { * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object * @fixup_init: fixup function, which is called when the init check - * fails + * fails. 
All fixup functions must return true if fixup + * was successful, otherwise return false * @fixup_activate: fixup function, which is called when the activate check * fails * @fixup_destroy: fixup function, which is called when the destroy check @@ -51,12 +52,12 @@ struct debug_obj { */ struct debug_obj_descr { const char *name; - void *(*debug_hint) (void *addr); - int (*fixup_init) (void *addr, enum debug_obj_state state); - int (*fixup_activate) (void *addr, enum debug_obj_state state); - int (*fixup_destroy) (void *addr, enum debug_obj_state state); - int (*fixup_free) (void *addr, enum debug_obj_state state); - int (*fixup_assert_init)(void *addr, enum debug_obj_state state); + void *(*debug_hint)(void *addr); + bool (*fixup_init)(void *addr, enum debug_obj_state state); + bool (*fixup_activate)(void *addr, enum debug_obj_state state); + bool (*fixup_destroy)(void *addr, enum debug_obj_state state); + bool (*fixup_free)(void *addr, enum debug_obj_state state); + bool (*fixup_assert_init)(void *addr, enum debug_obj_state state); }; #ifdef CONFIG_DEBUG_OBJECTS diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 519b5a10fd70..a9cee165cf25 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. */ -static int -debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), +static bool +debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { - int fixed = 0; - - if (fixup) - fixed = fixup(addr, state); - debug_objects_fixups += fixed; - return fixed; + if (fixup && fixup(addr, state)) { + debug_objects_fixups++; + return true; + } + return false; } static void debug_object_is_on_stack(void *addr, int onstack) @@ -797,7 +796,7 @@ static __initdata struct debug_obj_descr descr_type_test; * fixup_init is called when: * - an active object is initialized */ -static int __init fixup_init(void *addr, enum debug_obj_state state) +static bool __init fixup_init(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -805,9 +804,9 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_init(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -816,7 +815,7 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int __init fixup_activate(void *addr, enum debug_obj_state state) +static bool __init fixup_activate(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -825,17 +824,17 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) if (obj->static_init == 1) { debug_object_init(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 0; + return false; } - return 1; + return true; case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -843,7 +842,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) * fixup_destroy is called when: * - an active object is destroyed */ -static int __init fixup_destroy(void *addr, enum debug_obj_state 
state) +static bool __init fixup_destroy(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -851,9 +850,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_destroy(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -861,7 +860,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int __init fixup_free(void *addr, enum debug_obj_state state) +static bool __init fixup_free(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -869,9 +868,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_free(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } -- cgit v1.2.3-58-ga151 From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:41 -0700 Subject: debugobjects: insulate non-fixup logic related to static obj from fixup callbacks When activating a static object we need make sure that the object is tracked in the object tracker. If it is a non-static object then the activation is illegal. In previous implementation, each subsystem need take care of this in their fixup callbacks. Actually we can put it into debugobjects core. Thus we can save duplicated code, and have *pure* fixup callbacks. To achieve this, a new callback "is_static_object" is introduced to let the type specific code decide whether a object is static or not. If yes, we take it into object tracker, otherwise give warning and invoke fixup callback. This change has paassed debugobjects selftest, and I also do some test with all debugobjects supports enabled. At last, I have a concern about the fixups that can it change the object which is in incorrect state on fixup? Because the 'addr' may not point to any valid object if a non-static object is not tracked. Then Change such object can overwrite someone's memory and cause unexpected behaviour. For example, the timer_fixup_activate bind timer to function stub_timer. 
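To illustrate how a subsystem would use the new hook, here is a
hypothetical sketch (not part of the patch): struct my_obj,
MY_OBJ_STATIC_MAGIC and DEFINE_MY_OBJ are invented names, while the
debug_obj_descr fields and ODEBUG states are the ones provided by the
debugobjects API this series modifies.

/*
 * Hypothetical subsystem hooking up .is_static_object so the debugobjects
 * core can recognize statically initialized objects by itself.
 */
#include <linux/debugobjects.h>

#define MY_OBJ_STATIC_MAGIC	0x4d594f42UL

struct my_obj {
	unsigned long	magic;		/* set by the static initializer */
	/* ... payload ... */
};

#define DEFINE_MY_OBJ(name) \
	struct my_obj name = { .magic = MY_OBJ_STATIC_MAGIC }

/* Called by the debugobjects core for an object it is not yet tracking:
 * report whether it was set up by the static initializer above. */
static bool my_obj_is_static_object(void *addr)
{
	struct my_obj *obj = addr;

	return obj->magic == MY_OBJ_STATIC_MAGIC;
}

static struct debug_obj_descr my_obj_debug_descr = {
	.name			= "my_obj",
	.is_static_object	= my_obj_is_static_object,
	/* fixup callbacks stay optional and now only see real bugs */
};

Dynamic objects are still announced with debug_object_init() and
debug_object_activate() as before; a statically initialized object that
was never announced is recognized by the core through this callback and
taken into the tracker, so the subsystem's fixup callbacks are left to
handle genuine misuse only.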
Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com [changbin.du@intel.com: improve code comments where invoke the new is_static_object callback] Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debugobjects.h | 2 ++ kernel/rcu/update.c | 26 +++-------------------- kernel/time/hrtimer.c | 7 +------ kernel/time/timer.c | 43 +++++++++++++------------------------- kernel/workqueue.c | 42 ++++++++----------------------------- lib/debugobjects.c | 49 +++++++++++++++++++++++++++++--------------- 6 files changed, 60 insertions(+), 109 deletions(-) (limited to 'include') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index a899f10c9365..46056cb161fc 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -38,6 +38,7 @@ struct debug_obj { * @name: name of the object typee * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object + * @is_static_object return true if the obj is static, otherwise return false * @fixup_init: fixup function, which is called when the init check * fails. All fixup functions must return true if fixup * was successful, otherwise return false @@ -53,6 +54,7 @@ struct debug_obj { struct debug_obj_descr { const char *name; void *(*debug_hint)(void *addr); + bool (*is_static_object)(void *addr); bool (*fixup_init)(void *addr, enum debug_obj_state state); bool (*fixup_activate)(void *addr, enum debug_obj_state state); bool (*fixup_destroy)(void *addr, enum debug_obj_state state); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a9df198eb22d..3e888cd5a594 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. 
- */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return false; - default: - return true; - } + return true; } /** @@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f962a58c0957..8c7392c4fdbd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return false; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index be33481a4da1..3a95f9728778 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized @@ -516,30 +524,16 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } - return false; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); @@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } + setup_timer(timer, stub_timer, 0); + return true; default: return false; } @@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6751b18fd9ac..e1c0e996b5ae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } +static bool work_is_static_object(void *addr) +{ + struct work_struct *work = addr; + + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); +} + /* * fixup_init is called when: * - an active object is initialized @@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state) } } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static bool work_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return false; - } - WARN_ON_ONCE(1); - return false; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return false; - } -} - /* * fixup_free is called when: * - an active object is freed @@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2f07c8c697b8..a8e12601eb37 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -431,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * This happens when a static object is activated. We - * let the type specific code decide whether this is - * true or not. + * We are here when a static object is activated. We + * let the type specific code confirm whether this is + * true or not. if true, we just make sure that the + * static object is tracked in the object tracker. If + * not, this must be a bug, so we try to fix it up. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) { + if (descr->is_static_object && descr->is_static_object(addr)) { + /* track this static object */ + debug_object_init(addr, descr); + debug_object_activate(addr, descr); + } else { debug_print_object(&o, "activate"); - return -EINVAL; + ret = debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); + return ret ? 
0 : -EINVAL; } return 0; } @@ -602,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * Maybe the object is static. Let the type specific - * code decide what to do. + * Maybe the object is static, and we let the type specific + * code confirm. Track this static object if true, else invoke + * fixup. */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) + if (descr->is_static_object && descr->is_static_object(addr)) { + /* Track this static object */ + debug_object_init(addr, descr); + } else { debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE); + } return; } @@ -792,6 +805,13 @@ struct self_test { static __initdata struct debug_obj_descr descr_type_test; +static bool __init is_static_object(void *addr) +{ + struct self_test *obj = addr; + + return obj->static_init; +} + /* * fixup_init is called when: * - an active object is initialized @@ -813,7 +833,7 @@ static bool __init fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool __init fixup_activate(void *addr, enum debug_obj_state state) { @@ -821,13 +841,7 @@ static bool __init fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (obj->static_init == 1) { - debug_object_init(obj, &descr_type_test); - debug_object_activate(obj, &descr_type_test); - return false; - } return true; - case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); @@ -916,6 +930,7 @@ out: static __initdata struct debug_obj_descr descr_type_test = { .name = "selftest", + .is_static_object = is_static_object, .fixup_init = fixup_init, .fixup_activate = fixup_activate, .fixup_destroy = fixup_destroy, -- cgit v1.2.3-58-ga151 From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Thu, 19 May 2016 17:09:56 -0700 Subject: kernel/padata.c: removed unused code By accident I stumbled across code that has never been used. This driver has EXPORT_SYMBOL functions, and the only user of the code is pcrypt.c, but this only uses a subset of the exported symbols. According to 'git log -G', the functions, padata_set_cpumasks, padata_add_cpu, and padata_remove_cpu have never been used since they were first introduced. This patch removes the unused code. On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than 4k smaller. kbuild_hp> size $KBUILD_OUTPUT/vmlinux text data bss dec hex filename 10566658 4678360 1122304 16367322 f9beda vmlinux 10561984 4678360 1122304 16362648 f9ac98 vmlinux On another config, 32 bit, the saving is about 0.5k bytes. kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux 6012005 2409513 2785280 11206798 ab008e vmlinux 6011491 2409513 2785280 11206284 aafe8c vmlinux Signed-off-by: Richard Cochran Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/padata.h | 5 ---- kernel/padata.c | 64 -------------------------------------------------- 2 files changed, 69 deletions(-) (limited to 'include') diff --git a/include/linux/padata.h b/include/linux/padata.h index 438694650471..113ee626a4dc 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst, extern void padata_do_serial(struct padata_priv *padata); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); -extern int padata_set_cpumasks(struct padata_instance *pinst, - cpumask_var_t pcpumask, - cpumask_var_t cbcpumask); -extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask); -extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask); extern int padata_start(struct padata_instance *pinst); extern void padata_stop(struct padata_instance *pinst); extern int padata_register_cpumask_notifier(struct padata_instance *pinst, diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9c466a..67ddd4acde9d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -606,33 +606,6 @@ out_replace: return 0; } -/** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - /** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. @@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. 
- * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v1.2.3-58-ga151 From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 19 May 2016 17:10:37 -0700 Subject: mm: SLAB freelist randomization Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize the SLAB freelist. The list is randomized during initialization of a new set of pages. The order on different freelist sizes is pre-computed at boot for performance. Each kmem_cache has its own randomized freelist. Before pre-computed lists are available freelists are generated dynamically. This security feature reduces the predictability of the kernel SLAB allocator against heap overflows rendering attacks much less stable. For example this attack against SLUB (also applicable against SLAB) would be affected: https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/ Also, since v4.6 the freelist was moved at the end of the SLAB. It means a controllable heap is opened to new attacks not yet publicly discussed. A kernel heap overflow can be transformed to multiple use-after-free. This feature makes this type of attack harder too. To generate entropy, we use get_random_bytes_arch because 0 bits of entropy is available in the boot stage. In the worse case this function will fallback to the get_random_bytes sub API. We also generate a shift random number to shift pre-computed freelist for each new set of pages. The config option name is not specific to the SLAB as this approach will be extended to other allocators like SLUB. Performance results highlighted no major changes: Hackbench (running 90 10 times): Before average: 0.0698 After average: 0.0663 (-5.01%) slab_test 1 run on boot. Difference only seen on the 2048 size test being the worse case scenario covered by freelist randomization. New slab pages are constantly being created on the 10000 allocations. Variance should be mainly due to getting new pages every few allocations. Before: Single thread testing ===================== 1. 
Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles
10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles
10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles
10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles
10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles
10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles
10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles
10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles
10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles
10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles
10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles
10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles

2. Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 121 cycles
10000 times kmalloc(16)/kfree -> 121 cycles
10000 times kmalloc(32)/kfree -> 121 cycles
10000 times kmalloc(64)/kfree -> 121 cycles
10000 times kmalloc(128)/kfree -> 121 cycles
10000 times kmalloc(256)/kfree -> 119 cycles
10000 times kmalloc(512)/kfree -> 119 cycles
10000 times kmalloc(1024)/kfree -> 119 cycles
10000 times kmalloc(2048)/kfree -> 119 cycles
10000 times kmalloc(4096)/kfree -> 121 cycles
10000 times kmalloc(8192)/kfree -> 119 cycles
10000 times kmalloc(16384)/kfree -> 119 cycles

After: Single thread testing
=====================
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles
10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles
10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles
10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles
10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles
10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles
10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles
10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles
10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles
10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles
10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles
10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles

2.
Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 121 cycles 10000 times kmalloc(16)/kfree -> 121 cycles 10000 times kmalloc(32)/kfree -> 123 cycles 10000 times kmalloc(64)/kfree -> 142 cycles 10000 times kmalloc(128)/kfree -> 121 cycles 10000 times kmalloc(256)/kfree -> 119 cycles 10000 times kmalloc(512)/kfree -> 119 cycles 10000 times kmalloc(1024)/kfree -> 119 cycles 10000 times kmalloc(2048)/kfree -> 119 cycles 10000 times kmalloc(4096)/kfree -> 119 cycles 10000 times kmalloc(8192)/kfree -> 119 cycles 10000 times kmalloc(16384)/kfree -> 119 cycles [akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()] Signed-off-by: Thomas Garnier Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Greg Thelen Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 4 ++ init/Kconfig | 9 +++ mm/slab.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 178 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 9edbbf352340..8694f7a5d92b 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -80,6 +80,10 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif +#ifdef CONFIG_SLAB_FREELIST_RANDOM + void *random_seq; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/init/Kconfig b/init/Kconfig index 0dfd09d54c65..79a91a2c0444 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1742,6 +1742,15 @@ config SLOB endchoice +config SLAB_FREELIST_RANDOM + default n + depends on SLAB + bool "SLAB freelist randomization" + help + Randomizes the freelist order used on creating new SLABs. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/slab.c b/mm/slab.c index 8133ebea77a4..1ee26a0d358f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1243,6 +1243,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + unsigned int seed, count = cachep->num; + struct rnd_state state; + + if (count < 2) + return 0; + + /* If it fails, we will just use the global lists */ + cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage */ + get_random_bytes_arch(&seed, sizeof(seed)); + prandom_seed_state(&state, seed); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +static void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + + /* * Initialisation. 
Called after the page allocator have been initialised and * before smp_init(). @@ -2374,6 +2429,8 @@ void __kmem_cache_release(struct kmem_cache *cachep) int i; struct kmem_cache_node *n; + cache_random_seq_destroy(cachep); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ @@ -2480,15 +2537,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) #endif } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + freelist_idx_t *list; + unsigned int count; + unsigned int rand; + }; + struct rnd_state rnd_state; +}; + +/* + * Initialize the state based on the randomization methode available. + * return true if the pre-computed list is available, false otherwize. + */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + get_random_bytes_arch(&rand, sizeof(rand)); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = 0; + state->rand = rand; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + return (state->list[state->pos++] + state->rand) % state->count; +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * return true if the list was successfully shuffled, false otherwise. + */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +{ + unsigned int objfreelist = 0, i, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + page->freelist = index_to_obj(cachep, page, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. 
+ */ + if (!precomputed) { + freelist_randomize(&state.rnd_state, page->freelist, count); + } else { + for (i = 0; i < count; i++) + set_free_obj(page, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(page, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; void *objp; + bool shuffled; cache_init_objs_debug(cachep, page); - if (OBJFREELIST_SLAB(cachep)) { + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, page); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { page->freelist = index_to_obj(cachep, page, cachep->num - 1) + obj_offset(cachep); } @@ -2502,7 +2659,8 @@ static void cache_init_objs(struct kmem_cache *cachep, kasan_poison_object_data(cachep, objp); } - set_free_obj(page, i, i); + if (!shuffled) + set_free_obj(page, i, i); } } @@ -3841,6 +3999,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; + err = cache_random_seq_create(cachep, gfp); + if (err) + goto end; + if (!is_root_cache(cachep)) { struct kmem_cache *root = memcg_root_cache(cachep); limit = root->limit; @@ -3894,6 +4056,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) batchcount = (limit + 1) / 2; skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: if (err) pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); -- cgit v1.2.3-58-ga151 From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:49 -0700 Subject: mm: rename _count, field of the struct page, to _refcount Many developers already know that field for reference count of the struct page is _count and atomic type. They would try to handle it directly and this could break the purpose of page reference count tracepoint. To prevent direct _count modification, this patch rename it to _refcount and add warning message on the code. After that, developer who need to handle reference count will find that field should not be accessed directly. [akpm@linux-foundation.org: fix comments, per Vlastimil] [akpm@linux-foundation.org: Documentation/vm/transhuge.txt too] [sfr@canb.auug.org.au: sync ethernet driver changes] Signed-off-by: Joonsoo Kim Signed-off-by: Stephen Rothwell Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. 
Miller" Cc: Sunil Goutham Cc: Chris Metcalf Cc: Manish Chopra Cc: Yuval Mintz Cc: Tariq Toukan Cc: Saeed Mahameed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 10 +++++----- arch/tile/mm/init.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++---------- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 ++-- fs/proc/page.c | 2 +- include/linux/mm.h | 2 +- include/linux/mm_types.h | 14 ++++++++----- include/linux/page_ref.h | 26 ++++++++++++------------- include/linux/pagemap.h | 8 ++++---- kernel/kexec_core.c | 2 +- mm/huge_memory.c | 4 ++-- mm/internal.h | 2 +- mm/page_alloc.c | 4 ++-- mm/slub.c | 4 ++-- mm/vmscan.c | 4 ++-- 17 files changed, 58 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index d9cb65cf5cfd..fb0e1f2a19cc 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock. Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate in head page's ->_count. + - get_page()/put_page() and GUP operate in head page's ->_refcount. - - ->_count in tail pages is always zero: get_page_unless_zero() never + - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeed on tail pages. - map/unmap of the pages with PTE entry increment/decrement ->_mapcount @@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to sum of mapcount of all sub-pages plus one (split_huge_page caller must have reference for head page). -split_huge_page uses migration entries to stabilize page->_count and +split_huge_page uses migration entries to stabilize page->_refcount and page->_mapcount. We safe against physical memory scanners too: the only legitimate way scanner can get reference to a page is get_page_unless_zero(). -All tail pages has zero ->_count until atomic_add(). It prevent scanner +All tail pages has zero ->_refcount until atomic_add(). It prevent scanner from geting reference to tail page up to the point. After the atomic_add() -we don't care about ->_count value. We already known how many references +we don't care about ->_refcount value. We already known how many references with should uncharge from head page. For head page get_page_unless_zero() will succeed and we don't mind. It's diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index a0582b7f41d3..adce25462b0d 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end) * Hacky direct set to avoid unnecessary * lock take/release for EVERY page here. */ - p->_count.counter = 0; + p->_refcount.counter = 0; p->_mapcount.counter = -1; } init_page_count(page); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a822f44..d597e432e195 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. 
*/ static void bio_pageinc(struct bio *bio) diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index d9d6022c5aca..d2209147dc89 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma) if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex)) return; - /* drop page _counts */ + /* drop page _refcounts */ for (pg = 0; pg < msc->nr_pages; pg++) { struct page *page = msc_buffer_get_page(msc, pg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f3456798c596..bd947704b59c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i))) goto err_unmap; - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_add(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -452,8 +452,8 @@ err_unmap: while (--i >= 0) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq, */ split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->dma_info.page[i]._count); + page_ref_add(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq, dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz, PCI_DMA_FROMDEVICE); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->dma_info.page[i]._count); + page_ref_sub(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(&wi->dma_info.page[i]); } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 3aabfc0adefe..73dd525fbf08 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev, /* Incr page ref count to reuse on allocation failure * so that it doesn't get freed while freeing SKB. */ - atomic_inc(¤t_bd->data->_count); + page_ref_inc(current_bd->data); goto out; } @@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget) * freeing SKB. 
*/ - atomic_inc(&sw_rx_data->data->_count); + page_ref_inc(sw_rx_data->data); rxq->rx_alloc_errors++; qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num); diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9992cc..3ecd445e830d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 727f799757ab..1193a54ea2b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -734,7 +734,7 @@ static inline void get_page(struct page *page) page = compound_head(page); /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. + * requires to already have an elevated page->_refcount. */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2d75b4fa86c..1fda9c99ef95 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,9 +73,9 @@ struct page { unsigned long counters; #else /* - * Keep _count separate from slub cmpxchg_double data. - * As the rest of the double word is protected by - * slab_lock but _count is not. + * Keep _refcount separate from slub cmpxchg_double + * data. As the rest of the double word is protected by + * slab_lock but _refcount is not. */ unsigned counters; #endif @@ -97,7 +97,11 @@ struct page { }; int units; /* SLOB */ }; - atomic_t _count; /* Usage count, see below. */ + /* + * Usage count, *USE WRAPPER FUNCTION* + * when manual accounting. See page_ref.h + */ + atomic_t _refcount; }; unsigned int active; /* SLAB */ }; @@ -248,7 +252,7 @@ struct page_frag_cache { __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. + * containing page->_refcount every time we allocate a fragment. 
*/ unsigned int pagecnt_bias; bool pfmemalloc; diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index e596d5d9540e..8b5e0a9f2431 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v) static inline int page_ref_count(struct page *page) { - return atomic_read(&page->_count); + return atomic_read(&page->_refcount); } static inline int page_count(struct page *page) { - return atomic_read(&compound_head(page)->_count); + return atomic_read(&compound_head(page)->_refcount); } static inline void set_page_count(struct page *page, int v) { - atomic_set(&page->_count, v); + atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) __page_ref_set(page, v); } @@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { - atomic_add(nr, &page->_count); + atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { - atomic_sub(nr, &page->_count); + atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { - atomic_inc(&page->_count); + atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { - atomic_dec(&page->_count); + atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - int ret = atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); @@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) static inline int page_ref_dec_and_test(struct page *page) { - int ret = atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); @@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page) static inline int page_ref_dec_return(struct page *page) { - int ret = atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); @@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page) static inline int page_ref_add_unless(struct page *page, int nr, int u) { - int ret = atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); @@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u) static inline int page_ref_freeze(struct page *page, int count) { - int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) __page_ref_freeze(page, count, ret); @@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) 
VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); - atomic_set(&page->_count, count); + atomic_set(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7e1ab155c67c..fe1513ffb7bf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. - * If the page is free (_count == 0), then _count is untouched, and 0 - * is returned. Otherwise, _count is incremented by 1 and 1 is returned. + * If the page is free (_refcount == 0), then _refcount is untouched, and 0 + * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _count. + * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher @@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold); * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * - * Remove-side that cares about stability of _count (eg. reclaim) has the + * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with tree_lock held for write): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3ee3b86..1c03dfb4abfd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b49ee126d4d1..f8ac8f582fd8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. 
If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); diff --git a/mm/internal.h b/mm/internal.h index b79abb6721cf..098a89e3b97c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069efcc4d7..4ce57f938b7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) diff --git a/mm/slub.c b/mm/slub.c index 8671de2e5b12..cf1faa4d3992 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; diff --git a/mm/vmscan.c b/mm/vmscan.c index 142cb61f4822..d3a02ac3eed7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. @@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. 
*/ -- cgit v1.2.3-58-ga151 From d64e85d3e1c59c3664b9ec1183052ec4641ea1e2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 May 2016 17:10:52 -0700 Subject: compiler.h: add support for malloc attribute gcc as far back as at least 3.04 documents the function attribute __malloc__. Add a shorthand for attaching that to a function declaration. This was also suggested by Andi Kleen way back in 2002 [1], but didn't get applied, perhaps because gcc at that time generated the exact same code with and without this attribute. This attribute tells the compiler that the return value (if non-NULL) can be assumed not to alias any other valid pointers at the time of the call. Please note that the documentation for a range of gcc versions (starting from around 4.7) contained a somewhat confusing and self-contradicting text: The malloc attribute is used to tell the compiler that a function may be treated as if any non-NULL pointer it returns cannot alias any other pointer valid when the function returns and *that the memory has undefined content*. [...] Standard functions with this property include malloc and *calloc*. (emphasis mine). The intended meaning has later been clarified [2]: This tells the compiler that a function is malloc-like, i.e., that the pointer P returned by the function cannot alias any other pointer valid when the function returns, and moreover no pointers to valid objects occur in any storage addressed by P. What this means is that we can apply the attribute to kmalloc and friends, and it is ok for the returned memory to have well-defined contents (__GFP_ZERO). But it is not ok to apply it to kmemdup(), nor to other functions which both allocate and possibly initialize the memory with existing pointers. So unless someone is doing something pretty perverted kstrdup() should also be a fine candidate. [1] http://thread.gmane.org/gmane.linux.kernel/57172 [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56955 Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 1 + include/linux/compiler.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 3d5202eda22f..e2949397c19b 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -142,6 +142,7 @@ #if GCC_VERSION >= 30400 #define __must_check __attribute__((warn_unused_result)) +#define __malloc __attribute__((__malloc__)) #endif #if GCC_VERSION >= 40000 diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b5ff9881bef8..793c0829e3a3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __deprecated_for_modules #endif +#ifndef __malloc +#define __malloc +#endif + /* * Allow us to avoid 'defined but not used' warnings on functions and data, * as well as force them to be emitted to the assembly file. -- cgit v1.2.3-58-ga151 From 48a270554a3251681ae11173f2fd6389d943e183 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 May 2016 17:10:55 -0700 Subject: include/linux: apply __malloc attribute Attach the malloc attribute to a few allocation functions. 
This helps gcc generate better code by telling it that the return value doesn't alias any existing pointers (which is even more valuable given the pessimizations implied by -fno-strict-aliasing). A simple example of what this allows gcc to do can be seen by looking at the last part of drm_atomic_helper_plane_reset: plane->state = kzalloc(sizeof(*plane->state), GFP_KERNEL); if (plane->state) { plane->state->plane = plane; plane->state->rotation = BIT(DRM_ROTATE_0); } which compiles to e8 99 bf d6 ff callq ffffffff8116d540 48 85 c0 test %rax,%rax 48 89 83 40 02 00 00 mov %rax,0x240(%rbx) 74 11 je ffffffff814015c4 48 89 18 mov %rbx,(%rax) 48 8b 83 40 02 00 00 mov 0x240(%rbx),%rax [*] c7 40 40 01 00 00 00 movl $0x1,0x40(%rax) With this patch applied, the instruction at [*] is elided, since the store to plane->state->plane is known to not alter the value of plane->state. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 16 ++++++++-------- include/linux/device.h | 12 ++++++------ include/linux/kernel.h | 4 ++-- include/linux/mempool.h | 3 ++- include/linux/slab.h | 16 ++++++++-------- include/linux/string.h | 2 +- 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 35b22f94d2d2..f9be32691718 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size, unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *__alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit) __malloc; extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *__alloc_bootmem_low_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; #ifdef CONFIG_NO_BOOTMEM /* We are using top down, so it is safe to use 0 here */ diff --git a/include/linux/device.h b/include/linux/device.h index b130304f9b1b..ca90ad8bcd61 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); #ifdef CONFIG_DEBUG_DEVRES extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid, const char *name); + int nid, const char *name) __malloc; #define devres_alloc(release, size, gfp) \ __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) #define devres_alloc_node(release, size, gfp, nid) \ __devres_alloc_node(release, size, gfp, nid, 
#release) #else extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid); + int nid) __malloc; static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp) { return devres_alloc_node(release, size, gfp, NUMA_NO_NODE); @@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id); extern int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ -extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp); +extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc; extern __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, - va_list ap); + va_list ap) __malloc; extern __printf(3, 4) -char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...); +char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc; static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) { return devm_kmalloc(dev, size, gfp | __GFP_ZERO); @@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev, return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); } extern void devm_kfree(struct device *dev, void *p); -extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp); +extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; extern void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f7775e229b0..cc7398287fdd 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -412,9 +412,9 @@ extern __printf(3, 4) int scnprintf(char *buf, size_t size, const char *fmt, ...); extern __printf(3, 0) int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); -extern __printf(2, 3) +extern __printf(2, 3) __malloc char *kasprintf(gfp_t gfp, const char *fmt, ...); -extern __printf(2, 0) +extern __printf(2, 0) __malloc char *kvasprintf(gfp_t gfp, const char *fmt, va_list args); extern __printf(2, 0) const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args); diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 69b6951e8fd2..b1086c936507 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -5,6 +5,7 @@ #define _LINUX_MEMPOOL_H #include +#include struct kmem_cache; @@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); -extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); +extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* diff --git a/include/linux/slab.h b/include/linux/slab.h index 508bd827e6dc..aeb3e6d00a66 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size) } #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment; -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment; +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; +void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *, void *); /* @@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void 
*__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; +void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment; +extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment; + int node, size_t size) __assume_slab_alignment __malloc; #else static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, @@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #else static __always_inline void * kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) diff --git a/include/linux/string.h b/include/linux/string.h index d3993a79a325..26b6f6a66f83 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new); extern void kfree_const(const void *x); -extern char *kstrdup(const char *s, gfp_t gfp); +extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); -- cgit v1.2.3-58-ga151 From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:10:58 -0700 Subject: include/linux/nodemask.h: create next_node_in() helper Lots of code does node = next_node(node, XXX); if (node == MAX_NUMNODES) node = first_node(XXX); so create next_node_in() to do this and use it in various places. 
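For illustration only (the helper names next_online_rr_old/next_online_rr below are made up; next_node(), first_node(), next_node_in() and node_online_map are the existing nodemask API this patch touches), the open-coded wrap-around and its replacement look like:

/* Before: every caller open-coded the wrap back to the first node. */
static int next_online_rr_old(int node)
{
	node = next_node(node, node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);
	return node;
}

/*
 * After: next_node_in() does the wrap internally and returns
 * MAX_NUMNODES only when the mask is empty.
 */
static int next_online_rr(int node)
{
	return next_node_in(node, node_online_map);
}
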
[mhocko@suse.com: use next_node_in() helper] Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Michal Hocko Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/kernel/setup.c | 4 +--- arch/x86/mm/numa.c | 4 +--- include/linux/nodemask.h | 11 ++++++++++- kernel/cpuset.c | 8 +------- lib/Makefile | 2 +- lib/nodemask.c | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 4 +--- mm/memcontrol.c | 4 +--- mm/mempolicy.c | 24 ++---------------------- mm/page_isolation.c | 9 +++------ mm/slab.c | 13 +++---------- 11 files changed, 54 insertions(+), 59 deletions(-) create mode 100644 lib/nodemask.c (limited to 'include') diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index a992238e9b58..153020abd2f5 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void) cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]); cpu_2_node[best_cpu] = node; cpumask_clear_cpu(best_cpu, &unbound_cpus); - node = next_node(node, default_nodes); - if (node == MAX_NUMNODES) - node = first_node(default_nodes); + node = next_node_in(node, default_nodes); } /* Print out node assignments and set defaults for disabled cpus */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff46125..9c086c57105c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 6e85889cf9ab..f746e44d4046 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -43,8 +43,10 @@ * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int next_node_in(node, mask) Next node past 'node', or wrap to first, + * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or - * MAX_NUMNODES. + * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set @@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp) return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +/* + * Find the next present node in src, starting after node n, wrapping around to + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. 
+ */ +#define next_node_in(n, src) __next_node_in((n), &(src)) +int __next_node_in(int node, const nodemask_t *srcp); + static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956baba1..611cc69af8f0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) diff --git a/lib/Makefile b/lib/Makefile index 931396ada5eb..42b69185f963 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 000000000000..e42a5bf44d33 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include +#include +#include + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..5856093f9062 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -937,9 +937,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe787f5c41bd..6740c4c2b550 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU. 
* No LRU may hold pages because all pages are UNEVICTABLE or diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01bc950a..8d369cee0cd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include #include -#include #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f568206544..67bedd18429c 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. */ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/slab.c b/mm/slab.c index d81565a92864..c11bf5007952 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } -- cgit v1.2.3-58-ga151 From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:04 -0700 Subject: mm/hugetlb: introduce hugetlb_bad_size() When any unsupported hugepage size is specified, 'hugepagesz=' and 'hugepages=' should be ignored during command line parsing until any supported hugepage size is found. But currently incorrect number of hugepages are allocated when unsupported size is specified as it fails to ignore the 'hugepages=' command. 
Test case: Note that this is specific to x86 architecture. Boot the kernel with command line option 'hugepagesz=256M hugepages=X'. After boot, dmesg output shows that X number of hugepages of the size 2M is pre-allocated instead of 0. So, to handle such command line options, introduce new routine hugetlb_bad_size. The routine hugetlb_bad_size sets the global variable parsed_valid_hugepagesz. We are using parsed_valid_hugepagesz to save the state when unsupported hugepagesize is found so that we can ignore the 'hugepages=' parameters after that and then reset the variable when supported hugepage size is found. The routine hugetlb_bad_size can be called while setting 'hugepagesz=' parameter in an architecture specific code. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d953c2542a8..e44c57876e89 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); +void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); struct hstate *size_to_hstate(unsigned long size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb37ef810655..0adb74d0a4e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages); static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; +static bool __initdata parsed_valid_hugepagesz = true; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -2659,6 +2660,11 @@ static int __init hugetlb_init(void) subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_bad_size(void) +{ + parsed_valid_hugepagesz = false; +} + void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -2691,11 +2697,17 @@ static int __init hugetlb_nrpages_setup(char *s) unsigned long *mhp; static unsigned long *last_mhp; + if (!parsed_valid_hugepagesz) { + pr_warn("hugepages = %s preceded by " + "an unsupported hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!hugetlb_max_hstate) + else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; -- cgit v1.2.3-58-ga151 From 32f6271dbdc351dce96b11c5f3567bae8188004f Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:23 -0700 Subject: mm/hugetlb: is_vm_hugetlb_page() can return bool Make is_vm_hugetlb_page() return bool to improve readability due to this particular function only using either one or zero as its return value. 
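As a side illustration (not part of the patch; the helper below is hypothetical), a bool return lets callers compose the function as a plain predicate with no implicit int-to-truth conversion:

/* Hypothetical caller: flag VMAs that need special fault handling. */
static bool vma_needs_special_handling(struct vm_area_struct *vma)
{
	return is_vm_hugetlb_page(vma) || (vma->vm_flags & (VM_IO | VM_PFNMAP));
}
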
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_inline.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 2bb681fbeb35..a4e7ca0f3585 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -5,16 +5,16 @@ #include -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { - return 0; + return false; } #endif -- cgit v1.2.3-58-ga151 From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:26 -0700 Subject: mm/memory_hotplug: is_mem_section_removable() can return bool Make is_mem_section_removable() return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++--- mm/memory_hotplug.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index adbef586e696..20d8a5d4d133 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); +extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern void remove_memory(int nid, u64 start, u64 size); #else -static inline int is_mem_section_removable(unsigned long pfn, +static inline bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages) { - return 0; + return false; } static inline void try_offline_node(int nid) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa34431c3f31..b21d8895ea41 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1410,7 +1410,7 @@ static struct page *next_active_pageblock(struct page *page) } /* Checks if this range of memory is likely to be hot-removable. */ -int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; @@ -1418,12 +1418,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { if (!is_pageblock_removable_nolock(page)) - return 0; + return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ - return 1; + return true; } /* -- cgit v1.2.3-58-ga151 From bb00a789e565b96c52b2224c2280f7ac83175bec Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:29 -0700 Subject: mm/vmalloc.c: is_vmalloc_addr() can return bool Make is_vmalloc_addr() return bool to improve readability due to this particular function only using either one or zero as its return value. 
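For illustration only (free_kvbuf() is a hypothetical helper, though the pattern mirrors what kvfree() does in the kernel), the predicate is typically used to pick the matching free routine:

/* Hypothetical helper: release a buffer that may be vmalloc'ed or kmalloc'ed. */
static void free_kvbuf(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}
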
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1193a54ea2b3..5b375133c695 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ -static inline int is_vmalloc_addr(const void *x) +static inline bool is_vmalloc_addr(const void *x) { #ifdef CONFIG_MMU unsigned long addr = (unsigned long)x; return addr >= VMALLOC_START && addr < VMALLOC_END; #else - return 0; + return false; #endif } #ifdef CONFIG_MMU -- cgit v1.2.3-58-ga151 From 4ee815be1d34a6f254b3d09bdebcb27f294f2bd3 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:32 -0700 Subject: mm/mempolicy.c: vma_migratable() can return bool Make vma_migratable() return bool due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 2696c1f05ed1..6978a99e571f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol); extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ -static inline int vma_migratable(struct vm_area_struct *vma) +static inline bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - return 0; + return false; #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if (vma->vm_flags & VM_HUGETLB) - return 0; + return false; #endif /* @@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) - return 0; - return 1; + return false; + return true; } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); -- cgit v1.2.3-58-ga151 From 29f9cb53d25cd9916537b44b0af7f0b95a2e4438 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 19 May 2016 17:11:57 -0700 Subject: mm/highmem: simplify is_highmem() is_highmem() can be simplified by use of is_highmem_idx(). This patch removes redundant code and will make it easier to maintain if the zone policy is changed or a new zone is added. 
(akpm: saves me 25 bytes of text per is_highmem() callsite) Signed-off-by: Chanho Min Reviewed-by: Dan Williams Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60df9257cc7..150c6049f961 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -828,10 +828,7 @@ static inline int is_highmem_idx(enum zone_type idx) static inline int is_highmem(struct zone *zone) { #ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); + return is_highmem_idx(zone_idx(zone)); #else return 0; #endif -- cgit v1.2.3-58-ga151 From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:12:00 -0700 Subject: mm: uninline page_mapped() It's huge. Uninlining it saves 206 bytes per callsite. Shaves 4924 bytes from the x86_64 allmodconfig vmlinux. [akpm@linux-foundation.org: coding-style fixes] Cc: Steve Capper Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +-------------------- mm/util.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b375133c695..9c2852cabf01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page) return page->index; } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. - */ -static inline bool page_mapped(struct page *page) -{ - int i; - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) - return true; - if (PageHuge(page)) - return false; - for (i = 0; i < hpage_nr_pages(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) - return true; - } - return false; -} +bool page_mapped(struct page *page); /* * Return true only if the page has been allocated with diff --git a/mm/util.c b/mm/util.c index 6cc81e7b8705..8a1b3a1fb595 100644 --- a/mm/util.c +++ b/mm/util.c @@ -346,6 +346,29 @@ void *page_rmapping(struct page *page) return __page_rmapping(page); } +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. 
+ */ +bool page_mapped(struct page *page) +{ + int i; + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + if (PageHuge(page)) + return false; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; +} +EXPORT_SYMBOL(page_mapped); + struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; -- cgit v1.2.3-58-ga151 From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:35 -0700 Subject: mm: update_lru_size warn and reset bad lru_size Though debug kernels have a VM_BUG_ON to help protect from misaccounting lru_size, non-debug kernels are liable to wrap it around: and then the vast unsigned long size draws page reclaim into a loop of repeatedly doing nothing on an empty list, without even a cond_resched(). That soft lockup looks confusingly like an over-busy reclaim scenario, with lots of contention on the lru_lock in shrink_inactive_list(): yet has a totally different origin. Help differentiate with a custom warning in mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the size to avoid the lockup. But the particular bug which suggested this change was mine alone, and since fixed. Make it a WARN_ONCE: the first occurrence is the most informative, a flurry may follow, yet even when rate-limited little more is learnt. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- mm/memcontrol.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 712e8c37a200..d8cea81ab1ac 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_del(&page->lru); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 011dac8ab5d7..6a0199706f00 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1023,22 +1023,38 @@ out: * @lru: index of lru list the page is sitting on * @nr_pages: positive when adding or negative when removing * - * This function must be called when a page is added to or removed from an - * lru list. + * This function must be called under lru_lock, just before a page is added + * to or just after a page is removed from an lru list (that ordering being + * so as to allow it to check that lru_size 0 is consistent with list_empty). 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; + long size; + bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); lru_size = mz->lru_size + lru; - *lru_size += nr_pages; - VM_BUG_ON((long)(*lru_size) < 0); + empty = list_empty(lruvec->lists + lru); + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; + if (WARN_ONCE(size < 0 || empty != !size, + "%s(%p, %d, %d): lru_size %ld but %sempty\n", + __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + VM_BUG_ON(1); + *lru_size = 0; + } + + if (nr_pages > 0) + *lru_size += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -- cgit v1.2.3-58-ga151 From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:38 -0700 Subject: mm: update_lru_size do the __mod_zone_page_state Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy reclaim was removed) that lru_size can be updated by -nr_taken once per call to isolate_lru_pages(), instead of page by page. Update it inside isolate_lru_pages(), or at its two callsites? I chose to update it at the callsites, rearranging and grouping the updates by nr_taken and nr_scanned together in both. With one exception, mem_cgroup_update_lru_size(,lru,) is then used where __mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding some more calls in a future commit. Make the code a little smaller and simpler by incorporating stat update in lru_size update. The exception was move_active_pages_to_lru(), which aggregated the pgmoved stat update separately from the individual lru_size updates; but I still think this a simplification worth making. However, the __mod_zone_page_state is not peculiar to mem_cgroups: so better use the name update_lru_size, calls mem_cgroup_update_lru_size when CONFIG_MEMCG. Signed-off-by: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ include/linux/mm_inline.h | 24 ++++++++++++++++++------ mm/memcontrol.c | 2 ++ mm/vmscan.c | 23 ++++++++++------------- 4 files changed, 30 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1191d79aa495..94da96738df3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return 0; } -static inline void -mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int increment) -{ -} - static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d8cea81ab1ac..5bd29ba4f174 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page) return !PageSwapBacked(page); } +static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ + __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); +#else + __update_lru_size(lruvec, lru, nr_pages); +#endif +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); list_del(&page->lru); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); + update_lru_size(lruvec, lru, -hpage_nr_pages(page)); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a0199706f00..1b40dcad2b90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1035,6 +1035,8 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, long size; bool empty; + __update_lru_size(lruvec, lru, nr_pages); + if (mem_cgroup_disabled()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index d3a02ac3eed7..dcfdfc1a0942 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { struct page *page; - int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + nr_taken += hpage_nr_pages(page); list_move(&page->lru, dst); - nr_taken += nr_pages; break; case -EBUSY: @@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_taken = 
isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + update_lru_size(lruvec, lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); @@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { if (current_is_kswapd()) __count_zone_vm_events(PGSTEAL_KSWAPD, zone, @@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, pages_to_free); } } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); } @@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + update_lru_size(lruvec, lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { -- cgit v1.2.3-58-ga151 From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:44 -0700 Subject: tmpfs: preliminary minor tidyups Make a few cleanups in mm/shmem.c, before going on to complicate it. shmem_alloc_page() will become more complicated: we can't afford to to have that complication duplicated between a CONFIG_NUMA version and a !CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a single shmem_swapin() and a single shmem_alloc_page(). Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA configurations, but eliminating it is a larger cleanup: I have an alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and bug-prone, and changed yet again since my last version. Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to shmem_alloc_page(): that SwapBacked flag will be useful in future, to help to distinguish different cases appropriately. And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of little use (IIRC it dates back to when shmem_getpage() returned the page unlocked): kill it and do the necessary in shmem_file_read_iter(). But an arm64 build then complained that info may be uninitialized (where shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and advancing to an "sgp <= SGP_CACHE" test jogged it back to reality. Signed-off-by: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +++++ mm/shmem.c | 69 ++++++++++++++++++----------------------------- 2 files changed, 32 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6978a99e571f..4429d255c8ab 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) { } +static inline struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + return NULL; +} + #define vma_policy(vma) NULL static inline int diff --git a/mm/shmem.c b/mm/shmem.c index 9e609d58df73..6d2de2c1bf11 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,7 +101,6 @@ struct shmem_falloc { enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; @@ -169,7 +168,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. + * pages are allocated, in order to allow large sparse files. * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ @@ -947,8 +946,7 @@ redirty: return 0; } -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -972,7 +970,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) @@ -1008,39 +1017,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + } /* Drop reference taken by mpol_shared_policy_lookup() */ mpol_cond_put(pvma.vm_policy); return page; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -{ -} -#endif /* CONFIG_TMPFS */ - -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return swapin_readahead(swap, gfp, NULL, 0); -} - -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return alloc_page(gfp); -} -#endif /* CONFIG_NUMA */ - -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct 
shmem_sb_info *sbinfo) -{ - return NULL; -} -#endif /* * When a page is moved from swapcache to shmem filecache (either by the @@ -1084,8 +1071,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1155,7 +1140,7 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; @@ -1275,9 +1260,6 @@ repeat: error = -ENOMEM; goto decused; } - - __SetPageLocked(page); - __SetPageSwapBacked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); @@ -1321,12 +1303,10 @@ clear: flush_dcache_page(page); SetPageUptodate(page); } - if (sgp == SGP_DIRTY) - set_page_dirty(page); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); @@ -1633,7 +1613,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and even mark them dirty, so it cannot exceed the max_blocks limit. */ if (!iter_is_iovec(to)) - sgp = SGP_DIRTY; + sgp = SGP_CACHE; index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -1659,8 +1639,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = 0; break; } - if (page) + if (page) { + if (sgp == SGP_CACHE) + set_page_dirty(page); unlock_page(page); + } /* * We must evaluate after, since reads (unlike writes) -- cgit v1.2.3-58-ga151 From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:50 -0700 Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob. Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code. But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those. Signed-off-by: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 14 +++++++++++ include/linux/vmstat.h | 4 +++ kernel/sysctl.c | 7 ++++++ mm/vmstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 34a5fece3121..720355cbdf45 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm: - panic_on_oom - percpu_pagelist_fraction - stat_interval +- stat_refresh - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -755,6 +756,19 @@ is 1 second. ============================================================== +stat_refresh + +Any read or write (by root only) flushes all the per-cpu vm statistics +into their global totals, for more accurate reports when testing +e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo + +As a side-effect, it also checks for negative totals (elsewhere reported +as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. +(At time of writing, a few stats are known sometimes to be found negative, +with no ill effects: errors and warnings on these stats are suppressed.) + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 73fae8c4a5fb..02fce415b3d9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -193,6 +193,10 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); +struct ctl_table; +int vmstat_refresh(struct ctl_table *, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..2effd84d83e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { diff --git a/mm/vmstat.c b/mm/vmstat.c index a7de9adacbd9..c831be32a1a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. 
are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. + */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ + static void vmstat_update(struct work_struct *w) { if (refresh_cpu_vm_stats(true)) { -- cgit v1.2.3-58-ga151 From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:54 -0700 Subject: huge mm: move_huge_pmd does not need new_vma Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as it was in the old vma, alignment and size permitting. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 4 +--- mm/huge_memory.c | 7 ++----- mm/mremap.c | 5 ++--- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d7b9e5346fba..419fb9e03447 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -extern bool move_huge_pmd(struct vm_area_struct *vma, - struct vm_area_struct *new_vma, - unsigned long old_addr, +extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8ac8f582fd8..66675eed67be 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, - unsigned long old_addr, +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; - struct mm_struct *mm = vma->vm_mm; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE || - (new_vma->vm_flags & VM_NOHUGEPAGE)) + old_end - old_addr < HPAGE_PMD_SIZE) return false; /* diff --git a/mm/mremap.c b/mm/mremap.c index 3fa0a467df66..7d98fe1adc12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -198,9 +198,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, /* See comment in move_ptes() */ if 
(need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - moved = move_huge_pmd(vma, new_vma, old_addr, - new_addr, old_end, - old_pmd, new_pmd); + moved = move_huge_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); if (moved) { -- cgit v1.2.3-58-ga151 From fd8cfd3000191cb7f5b9ea8640bd46181f6b4b74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:13:00 -0700 Subject: arch: fix has_transparent_hugepage() I've just discovered that the useful-sounding has_transparent_hugepage() is actually an architecture-dependent minefield: on some arches it only builds if CONFIG_TRANSPARENT_HUGEPAGE=y, on others it's also there when not, but on some of those (arm and arm64) it then gives the wrong answer; and on mips alone it's marked __init, which would crash if called later (but so far it has not been called later). Straighten this out: make it available to all configs, with a sensible default in asm-generic/pgtable.h, removing its definitions from those arches (arc, arm, arm64, sparc, tile) which are served by the default, adding #define has_transparent_hugepage has_transparent_hugepage to those (mips, powerpc, s390, x86) which need to override the default at runtime, and removing the __init from mips (but maybe that kind of code should be avoided after init: set a static variable the first time it's called). Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Acked-by: David S. Miller Acked-by: Vineet Gupta [arch/arc] Acked-by: Gerald Schaefer [arch/s390] Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/include/asm/hugepage.h | 2 -- arch/arm/include/asm/pgtable-3level.h | 5 ----- arch/arm64/include/asm/pgtable.h | 5 ----- arch/mips/include/asm/pgtable.h | 1 + arch/mips/mm/tlb-r4k.c | 21 +++++++++++---------- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/pgtable.h | 1 - arch/s390/include/asm/pgtable.h | 1 + arch/sparc/include/asm/pgtable_64.h | 2 -- arch/tile/include/asm/pgtable.h | 1 - arch/x86/include/asm/pgtable.h | 1 + include/asm-generic/pgtable.h | 8 ++++++++ 12 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 7afe3356b770..317ff773e1ca 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -61,8 +61,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); -#define has_transparent_hugepage() 1 - /* Generic variants assume pgtable_t is struct page *, hence need for these */ #define __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index dc46398bc3a5..fa70db7c714b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -281,11 +281,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, flush_pmd_entry(pmdp); } -static inline int has_transparent_hugepage(void) -{ - return 1; -} - #endif /* __ASSEMBLY__ */ #endif /* _ASM_PGTABLE_3LEVEL_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2da46ae9c991..a7ac45a03dd0 100644 --- 
a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -314,11 +314,6 @@ static inline int pmd_protnone(pmd_t pmd) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) -static inline int has_transparent_hugepage(void) -{ - return 1; -} - #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9a4fe0133ff1..f53a7e3a4dd9 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -468,6 +468,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); static inline int pmd_trans_huge(pmd_t pmd) diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index c17d7627f872..2d93b63cf830 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -400,19 +400,20 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int __init has_transparent_hugepage(void) +int has_transparent_hugepage(void) { - unsigned int mask; - unsigned long flags; - - local_irq_save(flags); - write_c0_pagemask(PM_HUGE_MASK); - back_to_back_c0_hazard(); - mask = read_c0_pagemask(); - write_c0_pagemask(PM_DEFAULT_MASK); + static unsigned int mask = -1; - local_irq_restore(flags); + if (mask == -1) { /* first call comes during __init */ + unsigned long flags; + local_irq_save(flags); + write_c0_pagemask(PM_HUGE_MASK); + back_to_back_c0_hazard(); + mask = read_c0_pagemask(); + write_c0_pagemask(PM_DEFAULT_MASK); + local_irq_restore(flags); + } return mask == PM_HUGE_MASK; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 77d3ce05798e..8fe6f6b48aa5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -219,6 +219,7 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); +#define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 47897a30982d..ee09e99097f0 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 -#define has_transparent_hugepage() 0 #endif pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *shift); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2f66645587a2..18d2beb89340 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1223,6 +1223,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return MACHINE_HAS_HPAGE ? 
1 : 0; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index f089cfa249f3..93ce0ada3c63 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -681,8 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd) return pte_val(pte) & _PAGE_PMD_HUGE; } -#define has_transparent_hugepage() 1 - static inline pmd_t pmd_mkold(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 96cecf55522e..2a26cc4fefc2 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -487,7 +487,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 #define pmd_trans_huge pmd_huge_page #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f86491a7bc9d..1a27396b6ea0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9401f4819891..d4458b6dbfb4 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd) #define io_remap_pfn_range remap_pfn_range #endif +#ifndef has_transparent_hugepage +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define has_transparent_hugepage() 1 +#else +#define has_transparent_hugepage() 0 +#endif +#endif + #endif /* _ASM_GENERIC_PGTABLE_H */ -- cgit v1.2.3-58-ga151 From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:12 -0700 Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path If either the current task is already killed or PF_EXITING or a selected task is PF_EXITING then the oom killer is suppressed and so is the oom reaper. This patch adds try_oom_reaper which checks the given task and queues it for the oom reaper if that is safe to be done meaning that the task doesn't share the mm with an alive process. This might help to release the memory pressure while the task tries to exit. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. 
McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 8 +++++ mm/memcontrol.c | 1 + mm/oom_kill.c | 86 ++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 77 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 628a43242a34..83b9c39bd8b7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p) extern void mark_oom_victim(struct task_struct *tsk); +#ifdef CONFIG_MMU +extern void try_oom_reaper(struct task_struct *tsk); +#else +static inline void try_oom_reaper(struct task_struct *tsk) +{ +} +#endif + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b40dcad2b90..d71d387868e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1275,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); + try_oom_reaper(current); goto unlock; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 32d8210b8773..850b6ff66bdf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -563,6 +582,53 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + bool exiting; + + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, tsk)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot acces mm anymore. + */ + spin_lock_irq(&p->sighand->siglock); + exiting = signal_group_exit(p->signal); + spin_unlock_irq(&p->sighand->siglock); + if (exiting) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -652,24 +718,6 @@ void oom_killer_enable(void) oom_killer_disabled = false; } -/* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. 
- */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -694,6 +742,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,6 +922,7 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } -- cgit v1.2.3-58-ga151 From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:21 -0700 Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast path The PageAnon check always checks for compound_head but this is a relatively expensive check if the caller already knows the page is a head page. This patch creates a helper and uses it in the page free path which only operates on head pages. With this patch and "Only check PageCompound for high-order pages", the performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 vanilla nocompound-v1r20 Min alloc-odr0-1 425.00 ( 0.00%) 417.00 ( 1.88%) Min alloc-odr0-2 313.00 ( 0.00%) 308.00 ( 1.60%) Min alloc-odr0-4 257.00 ( 0.00%) 253.00 ( 1.56%) Min alloc-odr0-8 224.00 ( 0.00%) 221.00 ( 1.34%) Min alloc-odr0-16 208.00 ( 0.00%) 205.00 ( 1.44%) Min alloc-odr0-32 199.00 ( 0.00%) 199.00 ( 0.00%) Min alloc-odr0-64 195.00 ( 0.00%) 193.00 ( 1.03%) Min alloc-odr0-128 192.00 ( 0.00%) 191.00 ( 0.52%) Min alloc-odr0-256 204.00 ( 0.00%) 200.00 ( 1.96%) Min alloc-odr0-512 213.00 ( 0.00%) 212.00 ( 0.47%) Min alloc-odr0-1024 219.00 ( 0.00%) 219.00 ( 0.00%) Min alloc-odr0-2048 225.00 ( 0.00%) 225.00 ( 0.00%) Min alloc-odr0-4096 230.00 ( 0.00%) 231.00 ( -0.43%) Min alloc-odr0-8192 235.00 ( 0.00%) 234.00 ( 0.43%) Min alloc-odr0-16384 235.00 ( 0.00%) 234.00 ( 0.43%) Min free-odr0-1 215.00 ( 0.00%) 191.00 ( 11.16%) Min free-odr0-2 152.00 ( 0.00%) 136.00 ( 10.53%) Min free-odr0-4 119.00 ( 0.00%) 107.00 ( 10.08%) Min free-odr0-8 106.00 ( 0.00%) 96.00 ( 9.43%) Min free-odr0-16 97.00 ( 0.00%) 87.00 ( 10.31%) Min free-odr0-32 91.00 ( 0.00%) 83.00 ( 8.79%) Min free-odr0-64 89.00 ( 0.00%) 81.00 ( 8.99%) Min free-odr0-128 88.00 ( 0.00%) 80.00 ( 9.09%) Min free-odr0-256 106.00 ( 0.00%) 95.00 ( 10.38%) Min free-odr0-512 116.00 ( 0.00%) 111.00 ( 4.31%) Min free-odr0-1024 125.00 ( 0.00%) 118.00 ( 5.60%) Min free-odr0-2048 133.00 ( 0.00%) 126.00 ( 5.26%) Min free-odr0-4096 136.00 ( 0.00%) 130.00 ( 4.41%) Min free-odr0-8192 138.00 ( 0.00%) 130.00 ( 5.80%) Min free-odr0-16384 137.00 ( 0.00%) 130.00 ( 5.11%) There is a sizable boost to the free allocator performance. While there is an apparent boost on the allocation side, it's likely a co-incidence or due to the patches slightly reducing cache footprint. 
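The saving can be pictured with a small standalone model (the struct layout, field names and PAGE_MAPPING_ANON value below are simplified stand-ins, not the kernel's definitions): a caller that already holds a head page can test the anon bit directly and skip the compound-head resolution that the general helper still has to perform.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_MAPPING_ANON 0x1UL

struct page {
	void *mapping;			/* low bit set means "anonymous" in this model */
	struct page *compound_head;	/* NULL when the page is itself a head page */
};

/* cheap test: only valid when the caller already knows 'page' is a head page */
static inline bool page_anon_head(const struct page *page)
{
	return ((uintptr_t)page->mapping & PAGE_MAPPING_ANON) != 0;
}

/* general test: resolve the head page first, then reuse the cheap test */
static inline bool page_anon(const struct page *page)
{
	if (page->compound_head)
		page = page->compound_head;
	return page_anon_head(page);
}
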
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 7 ++++++- mm/page_alloc.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b052aa7b5b7..a61e06e5fbce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY) #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) +static __always_inline int PageAnonHead(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + static __always_inline int PageAnon(struct page *page) { page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return PageAnonHead(page); } #ifdef CONFIG_KSM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 087ba3e417ec..7be1ce8b6be0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1048,7 +1048,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) bad += free_pages_check(page + i); } } - if (PageAnon(page)) + if (PageAnonHead(page)) page->mapping = NULL; bad += free_pages_check(page); if (bad) -- cgit v1.2.3-58-ga151 From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:27 -0700 Subject: mm, page_alloc: inline zone_statistics zone_statistics has one call-site but it's a public function. Make it static and inline. The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 statbranch-v1r20 statinline-v1r20 Min alloc-odr0-1 419.00 ( 0.00%) 412.00 ( 1.67%) Min alloc-odr0-2 305.00 ( 0.00%) 301.00 ( 1.31%) Min alloc-odr0-4 250.00 ( 0.00%) 247.00 ( 1.20%) Min alloc-odr0-8 219.00 ( 0.00%) 215.00 ( 1.83%) Min alloc-odr0-16 203.00 ( 0.00%) 199.00 ( 1.97%) Min alloc-odr0-32 195.00 ( 0.00%) 191.00 ( 2.05%) Min alloc-odr0-64 191.00 ( 0.00%) 187.00 ( 2.09%) Min alloc-odr0-128 189.00 ( 0.00%) 185.00 ( 2.12%) Min alloc-odr0-256 198.00 ( 0.00%) 193.00 ( 2.53%) Min alloc-odr0-512 210.00 ( 0.00%) 207.00 ( 1.43%) Min alloc-odr0-1024 216.00 ( 0.00%) 213.00 ( 1.39%) Min alloc-odr0-2048 221.00 ( 0.00%) 220.00 ( 0.45%) Min alloc-odr0-4096 227.00 ( 0.00%) 226.00 ( 0.44%) Min alloc-odr0-8192 232.00 ( 0.00%) 229.00 ( 1.29%) Min alloc-odr0-16384 232.00 ( 0.00%) 229.00 ( 1.29%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 2 -- mm/page_alloc.c | 31 +++++++++++++++++++++++++++++++ mm/vmstat.c | 29 ----------------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 02fce415b3d9..d2da8e053210 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_NUMA extern unsigned long node_page_state(int node, enum zone_stat_item item); -extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else #define node_page_state(node, item) global_page_state(item) -#define zone_statistics(_zl, _z, gfp) do { } while (0) #endif /* CONFIG_NUMA */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7be1ce8b6be0..36384baa74e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2354,6 +2354,37 @@ int 
split_free_page(struct page *page) return nr_pages; } +/* + * Update NUMA hit/miss statistics + * + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) +{ +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index d585de27e960..f1a73bfb77b5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -569,35 +569,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) #endif #ifdef CONFIG_NUMA -/* - * zonelist = the list of zones passed to the allocator - * z = the zone from which the allocation occurred. - * - * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. - */ -void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) -{ - int local_nid = numa_node_id(); - enum zone_stat_item local_stat = NUMA_LOCAL; - - if (unlikely(flags & __GFP_OTHER_NODE)) { - local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - - if (z->node == local_nid) { - __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); - } -} - /* * Determine the per node value of a stat item. */ -- cgit v1.2.3-58-ga151 From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:30 -0700 Subject: mm, page_alloc: inline the fast path of the zonelist iterator The page allocator iterates through a zonelist for zones that match the addressing limitations and nodemask of the caller but many allocations will not be restricted. Despite this, there is always functional call overhead which builds up. This patch inlines the optimistic basic case and only calls the iterator function for the complex case. A hindrance was the fact that cpuset_current_mems_allowed is used in the fastpath as the allowed nodemask even though all nodes are allowed on most systems. The patch handles this by only considering cpuset_current_mems_allowed if a cpuset exists. As well as being faster in the fast-path, this removes some junk in the slowpath. 
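The shape of that split can be sketched in a few lines of plain C (the types and the nodemask representation are invented for illustration; this is not the kernel iterator itself): the unconstrained case is answered inline, and only constrained lookups pay for a real function call.

struct zref {
	int zone_idx;			/* stand-in for zonelist_zone_idx(); -1 terminates */
};

/* out-of-line slow path: only reached when a nodemask or zone-type limit
 * actually has to be applied (nodemask modelled as a plain bitmap here) */
static struct zref *zref_next_slow(struct zref *z, int highest_idx,
				   const unsigned long *nodemask)
{
	while (z->zone_idx >= 0 &&
	       (z->zone_idx > highest_idx ||
		(nodemask && !(*nodemask & (1UL << z->zone_idx)))))
		z++;
	return z;
}

/* inline fast path: the common unconstrained allocation never leaves this */
static inline struct zref *zref_next(struct zref *z, int highest_idx,
				     const unsigned long *nodemask)
{
	if (!nodemask && z->zone_idx <= highest_idx)
		return z;
	return zref_next_slow(z, highest_idx, nodemask);
}
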
The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 statinline-v1r20 optiter-v1r20 Min alloc-odr0-1 412.00 ( 0.00%) 382.00 ( 7.28%) Min alloc-odr0-2 301.00 ( 0.00%) 282.00 ( 6.31%) Min alloc-odr0-4 247.00 ( 0.00%) 233.00 ( 5.67%) Min alloc-odr0-8 215.00 ( 0.00%) 203.00 ( 5.58%) Min alloc-odr0-16 199.00 ( 0.00%) 188.00 ( 5.53%) Min alloc-odr0-32 191.00 ( 0.00%) 182.00 ( 4.71%) Min alloc-odr0-64 187.00 ( 0.00%) 177.00 ( 5.35%) Min alloc-odr0-128 185.00 ( 0.00%) 175.00 ( 5.41%) Min alloc-odr0-256 193.00 ( 0.00%) 184.00 ( 4.66%) Min alloc-odr0-512 207.00 ( 0.00%) 197.00 ( 4.83%) Min alloc-odr0-1024 213.00 ( 0.00%) 203.00 ( 4.69%) Min alloc-odr0-2048 220.00 ( 0.00%) 209.00 ( 5.00%) Min alloc-odr0-4096 226.00 ( 0.00%) 214.00 ( 5.31%) Min alloc-odr0-8192 229.00 ( 0.00%) 218.00 ( 4.80%) Min alloc-odr0-16384 229.00 ( 0.00%) 219.00 ( 4.37%) perf indicated that next_zones_zonelist disappeared in the profile and __next_zones_zonelist did not appear. This is expected as the micro-benchmark would hit the inlined fast-path every time. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +++++++++++-- mm/mmzone.c | 2 +- mm/page_alloc.c | 26 +++++++++----------------- 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 150c6049f961..cfcd7723edb6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) #endif /* CONFIG_NUMA */ } +struct zoneref *__next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes); + /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z - The cursor used as a starting point for the search @@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * being examined. It should be advanced by one before calling * next_zones_zonelist again. */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes); + nodemask_t *nodes) +{ + if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) + return z; + return __next_zones_zonelist(z, highest_zoneidx, nodes); +} /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist diff --git a/mm/mmzone.c b/mm/mmzone.c index 52687fb4de6f..5652be858e5e 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36384baa74e1..789e5f065e8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3192,17 +3192,6 @@ retry: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets. 
- */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3358,14 +3347,21 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ -3389,16 +3385,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); + ac.nodemask, &ac.preferred_zone); if (!ac.preferred_zone) goto out; ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); -- cgit v1.2.3-58-ga151 From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:38 -0700 Subject: mm, page_alloc: convert alloc_flags to unsigned alloc_flags is a bitmask of flags but it is signed which does not necessarily generate the best code depending on the compiler. Even without an impact, it makes more sense that this be unsigned. 
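Illustrative only, with made-up flag names and values: a flag word of this kind is a pure bit set, so declaring and passing it as unsigned int matches how it is actually read and written.

#include <stdbool.h>

#define ALLOC_WMARK_MIN	0x01u		/* invented values, for illustration only */
#define ALLOC_HARDER	0x02u
#define ALLOC_HIGH	0x04u
#define ALLOC_CPUSET	0x08u

static inline unsigned int default_alloc_flags(void)
{
	return ALLOC_WMARK_MIN | ALLOC_CPUSET;
}

static inline bool alloc_harder(unsigned int alloc_flags)
{
	return (alloc_flags & ALLOC_HARDER) != 0;	/* bit test, no signed arithmetic */
}
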
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 6 +++--- include/linux/mmzone.h | 3 ++- mm/compaction.c | 12 +++++++----- mm/internal.h | 2 +- mm/page_alloc.c | 26 ++++++++++++++------------ 5 files changed, 27 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index d7c8de583a23..242b660f64e6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended); + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cfcd7723edb6..327f0fa1e1ce 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags); + unsigned long mark, int classzone_idx, + unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); enum memmap_context { diff --git a/mm/compaction.c b/mm/compaction.c index 7487067b4613..8f339ca25621 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1313,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_CONTINUE - If compaction should run now */ static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { int fragindex; unsigned long watermark; @@ -1358,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, } unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { unsigned long ret; @@ -1530,7 +1532,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1571,8 +1573,8 @@ int sysctl_extfrag_threshold = 500; * This is the main entry point for direct page compaction. 
*/ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; diff --git a/mm/internal.h b/mm/internal.h index 098a89e3b97c..114593aab55c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,7 +175,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const int alloc_flags; /* alloc flags of a direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f328cfb137d..094587a4ed81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1525,7 +1525,7 @@ static inline bool free_pages_prezeroed(bool poisoned) } static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) + unsigned int alloc_flags) { int i; bool poisoned = true; @@ -2391,7 +2391,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2545,12 +2546,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * to check in the allocation paths if no pages are free. 
*/ static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, + unsigned long mark, int classzone_idx, + unsigned int alloc_flags, long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2613,7 +2615,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); @@ -2957,7 +2959,7 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3013,7 +3015,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3053,7 +3055,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3092,10 +3094,10 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3156,7 +3158,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; + unsigned int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; @@ -3348,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), -- cgit v1.2.3-58-ga151 From 09940a4f1e816abe3248fa0d185fc0e7f54c8c12 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:53 -0700 Subject: mm, page_alloc: simplify last cpupid reset The current reset unnecessarily clears flags and makes pointless calculations. 
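The equivalence behind the simplification can be checked with a tiny standalone program (the SHIFT/PGSHIFT values are hypothetical, not the kernel's layout): because the reset value is all ones within the field, clearing the field and then inserting that value is the same as a single OR of the shifted mask.

#include <assert.h>

#define LAST_CPUPID_SHIFT	8		/* field width: hypothetical value */
#define LAST_CPUPID_PGSHIFT	16		/* field position: hypothetical value */
#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_SHIFT) - 1)

int main(void)
{
	unsigned long flags = 0x12345678UL;	/* arbitrary page->flags contents */
	unsigned long cpupid = (1UL << LAST_CPUPID_SHIFT) - 1;	/* all-ones reset value */

	/* old sequence: compute the reset value, clear the field, insert it */
	unsigned long oldf = flags;
	oldf &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
	oldf |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;

	/* new sequence: the reset value is all ones, so one OR is enough */
	unsigned long newf = flags | (LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);

	assert(oldf == newf);
	return 0;
}
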
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2852cabf01..2b97be1147ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - int cpupid = (1 << LAST_CPUPID_SHIFT) - 1; - - page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); - page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ #else /* !CONFIG_NUMA_BALANCING */ -- cgit v1.2.3-58-ga151 From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:10 -0700 Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice The allocator fast path looks up the first usable zone in a zonelist and then get_page_from_freelist does the same job in the zonelist iterator. This patch preserves the necessary information. 4.6.0-rc2 4.6.0-rc2 fastmark-v1r20 initonce-v1r20 Min alloc-odr0-1 364.00 ( 0.00%) 359.00 ( 1.37%) Min alloc-odr0-2 262.00 ( 0.00%) 260.00 ( 0.76%) Min alloc-odr0-4 214.00 ( 0.00%) 214.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 186.00 ( 0.00%) Min alloc-odr0-16 173.00 ( 0.00%) 173.00 ( 0.00%) Min alloc-odr0-32 165.00 ( 0.00%) 165.00 ( 0.00%) Min alloc-odr0-64 161.00 ( 0.00%) 162.00 ( -0.62%) Min alloc-odr0-128 159.00 ( 0.00%) 161.00 ( -1.26%) Min alloc-odr0-256 168.00 ( 0.00%) 170.00 ( -1.19%) Min alloc-odr0-512 180.00 ( 0.00%) 181.00 ( -0.56%) Min alloc-odr0-1024 190.00 ( 0.00%) 190.00 ( 0.00%) Min alloc-odr0-2048 196.00 ( 0.00%) 196.00 ( 0.00%) Min alloc-odr0-4096 202.00 ( 0.00%) 202.00 ( 0.00%) Min alloc-odr0-8192 206.00 ( 0.00%) 205.00 ( 0.49%) Min alloc-odr0-16384 206.00 ( 0.00%) 205.00 ( 0.49%) The benefit is negligible and the results are within the noise but each cycle counts. 
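One way to picture the change, using invented types rather than the real zoneref/alloc_context: look the cursor up once, remember the cursor itself instead of only the zone it points at, and start the scan from that cursor rather than repeating the search.

struct zref {
	int zone_idx;			/* stand-in for a zoneref; -1 terminates the list */
};

static struct zref *first_zref(struct zref *z, int highest_idx)
{
	while (z->zone_idx >= 0 && z->zone_idx > highest_idx)
		z++;
	return z;			/* the caller keeps this cursor as "preferred" */
}

static int allocate(struct zref *zonelist, int highest_idx)
{
	struct zref *preferred = first_zref(zonelist, highest_idx);	/* one lookup */
	struct zref *z;

	for (z = preferred; z->zone_idx >= 0; z++) {
		/* try this zone; fairness/statistics decisions can compare against
		 * preferred->zone_idx instead of re-walking the zonelist */
		if (z->zone_idx <= highest_idx)
			return z->zone_idx;
	}
	return -1;			/* nothing suitable */
}

Keeping the cursor rather than the zone is what allows the index to be recomputed later without a second walk.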
Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 10 +++++----- include/linux/mmzone.h | 18 +++++++++++------- mm/internal.h | 2 +- mm/mempolicy.c | 19 ++++++++++--------- mm/page_alloc.c | 34 ++++++++++++++++------------------ 5 files changed, 43 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a82a8ed..754813a6962b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -255,17 +255,17 @@ out: */ static void free_more_memory(void) { - struct zone *zone; + struct zoneref *z; int nid; wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) + + z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL); + if (z->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 327f0fa1e1ce..4b28d2f8125e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { - struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); - *zone = zonelist_zone(z); - return z; } /** @@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ - for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ + for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ - zone = zonelist_zone(z)) \ + zone = zonelist_zone(z)) + +#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ + for (zone = z->zone; \ + zone; \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) + /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/internal.h b/mm/internal.h index 114593aab55c..d1ddd71c1bbf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -102,7 +102,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; - struct zone *preferred_zone; + struct zoneref *preferred_zoneref; int classzone_idx; int migratetype; enum zone_type high_zoneidx; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f80ebcd6552..297d6854f849 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1739,18 +1739,18 @@ unsigned int mempolicy_slab_node(void) return interleave_nodes(policy); case MPOL_BIND: { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : node; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes); + return z->zone ? 
z->zone->node : node; } default: @@ -2266,7 +2266,7 @@ static void sp_free(struct sp_node *n) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; - struct zone *zone; + struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); @@ -2298,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* * allows binding to multiple nodes. * use current page if in policy nodemask, @@ -2306,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ if (node_isset(curnid, pol->v.nodes)) goto out; - (void)first_zones_zonelist( + z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->v.nodes); + polnid = z->zone->node; break; default: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 732875b1bdfb..dba8cfd0b2d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2704,7 +2704,7 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; bool fair_skipped = false; bool apply_fair = (alloc_flags & ALLOC_FAIR); @@ -2714,7 +2714,7 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -2734,7 +2734,7 @@ zonelist_scan: fair_skipped = true; continue; } - if (!zone_local(ac->preferred_zone, zone)) { + if (!zone_local(ac->preferred_zoneref->zone, zone)) { if (fair_skipped) goto reset_fair; apply_fair = false; @@ -2780,7 +2780,7 @@ zonelist_scan: goto try_this_zone; if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2802,7 +2802,7 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) @@ -2831,7 +2831,7 @@ try_this_zone: reset_fair: apply_fair = false; fair_skipped = false; - reset_alloc_batches(ac->preferred_zone); + reset_alloc_batches(ac->preferred_zoneref->zone); goto zonelist_scan; } @@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref)); } static inline unsigned int @@ -3332,7 +3332,7 @@ retry: if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); goto retry; } @@ -3370,7 +3370,6 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 
nodemask_t *nodemask) { - struct zoneref *preferred_zoneref; struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; @@ -3416,14 +3415,14 @@ retry_cpuset: ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask, &ac.preferred_zone); - if (!ac.preferred_zone) { + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { page = NULL; goto no_zone; } - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -4462,13 +4461,12 @@ static void build_zonelists(pg_data_t *pgdat) */ int local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif -- cgit v1.2.3-58-ga151 From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:27 -0700 Subject: mm, page_alloc: inline pageblock lookup in page free fast paths The function call overhead of get_pfnblock_flags_mask() is measurable in the page free paths. This patch uses an inlined version that is faster. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 7 -- mm/page_alloc.c | 188 ++++++++++++++++++++++++++----------------------- mm/page_owner.c | 2 +- mm/vmstat.c | 2 +- 4 files changed, 102 insertions(+), 97 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4b28d2f8125e..c60db2096fd8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled; get_pfnblock_flags_mask(page, page_to_pfn(page), \ PB_migrate_end, MIGRATETYPE_MASK) -static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) -{ - BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); - return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, - MIGRATETYPE_MASK); -} - struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822ce86fc883..bdf7a13311b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of 
pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -6831,94 +6931,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return page_zone(page)->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long 
pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages diff --git a/mm/page_owner.c b/mm/page_owner.c index 438768c092ac..792b56da13d8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; /* Print information relevant to grouping pages by mobility */ - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", diff --git a/mm/vmstat.c b/mm/vmstat.c index f1a73bfb77b5..5b72a8ad2813 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1041,7 +1041,7 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = min(block_end_pfn, end_pfn); page = pfn_to_page(pfn); - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) -- cgit v1.2.3-58-ga151 From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:30 -0700 Subject: cpuset: use static key better and convert to new API An important function for cpusets is cpuset_node_allowed(), which optimizes on the fact if there's a single root CPU set, it must be trivially allowed. But the check "nr_cpusets() <= 1" doesn't use the cpusets_enabled_key static key the right way where static keys eliminate branching overhead with jump labels. This patch converts it so that static key is used properly. It's also switched to the new static key API and the checking functions are converted to return bool instead of int. We also provide a new variant __cpuset_zone_allowed() which expects that the static key check was already done and they key was enabled. 
This is needed for get_page_from_freelist() where we want to also avoid
the relatively slower check when ALLOC_CPUSET is not set in alloc_flags.

The impact on the page allocator microbenchmark is less than expected but
the cleanup in itself is worthwhile.

                                4.6.0-rc2           4.6.0-rc2
                          multcheck-v1r20        cpuset-v1r20
Min alloc-odr0-1         348.00 (  0.00%)    348.00 (  0.00%)
Min alloc-odr0-2         254.00 (  0.00%)    254.00 (  0.00%)
Min alloc-odr0-4         213.00 (  0.00%)    213.00 (  0.00%)
Min alloc-odr0-8         186.00 (  0.00%)    183.00 (  1.61%)
Min alloc-odr0-16        173.00 (  0.00%)    171.00 (  1.16%)
Min alloc-odr0-32        166.00 (  0.00%)    163.00 (  1.81%)
Min alloc-odr0-64        162.00 (  0.00%)    159.00 (  1.85%)
Min alloc-odr0-128       160.00 (  0.00%)    157.00 (  1.88%)
Min alloc-odr0-256       169.00 (  0.00%)    166.00 (  1.78%)
Min alloc-odr0-512       180.00 (  0.00%)    180.00 (  0.00%)
Min alloc-odr0-1024      188.00 (  0.00%)    187.00 (  0.53%)
Min alloc-odr0-2048      194.00 (  0.00%)    193.00 (  0.52%)
Min alloc-odr0-4096      199.00 (  0.00%)    198.00 (  0.50%)
Min alloc-odr0-8192      202.00 (  0.00%)    201.00 (  0.50%)
Min alloc-odr0-16384     203.00 (  0.00%)    202.00 (  0.49%)

Signed-off-by: Vlastimil Babka
Signed-off-by: Mel Gorman
Acked-by: Zefan Li
Cc: Jesper Dangaard Brouer
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++--------------
 kernel/cpuset.c | 14 +++++++-------
 mm/page_alloc.c | 2 +-
 3 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..bfc204e70338 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
 #ifdef CONFIG_CPUSETS
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
-	return static_key_false(&cpusets_enabled_key);
+	return static_branch_unlikely(&cpusets_enabled_key);
 }
 static inline int nr_cpusets(void)
 {
 	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key) + 1;
+	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 static inline void cpuset_inc(void)
 {
-	static_key_slow_inc(&cpusets_enabled_key);
+	static_branch_inc(&cpusets_enabled_key);
 }
 static inline void cpuset_dec(void)
 {
-	static_key_slow_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_enabled_key);
 }
 extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+	if (cpusets_enabled())
+		return __cpuset_node_allowed(node, gfp_mask);
+	return true;
 }
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	if (cpusets_enabled())
+		return __cpuset_zone_allowed(z, gfp_mask);
+	return true;
 }
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int
cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return 1; + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return 1; + return true; +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 611cc69af8f0..73e93e53884d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include #include -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdf7a13311b5..39c441bb8d61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2859,7 +2859,7 @@ zonelist_scan: if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual -- cgit v1.2.3-58-ga151
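The cpuset change above is mostly about the shape of the fast path: a
cheap global "are cpusets in use at all" check guards the real nodemask
test, so the common single-cpuset case never reaches the slow path. In
the kernel that check is a static key (a patched jump label, so the
disabled case costs no real branch); the sketch below only models the
control flow with a plain flag, and every name and the toy nodemask type
in it are illustrative assumptions, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

static bool cpusets_enabled_flag;	/* stands in for cpusets_enabled_key */

/* Slow path: the real check against the task's allowed nodes. */
static bool __node_allowed(int node, unsigned long allowed_mask)
{
	return allowed_mask & (1UL << node);
}

/* Fast path: skip the lookup entirely while no cpuset is active. */
static bool node_allowed(int node, unsigned long allowed_mask)
{
	if (cpusets_enabled_flag)
		return __node_allowed(node, allowed_mask);
	return true;
}

int main(void)
{
	printf("%d\n", node_allowed(2, 0x1));	/* 1: cpusets off, always allowed */
	cpusets_enabled_flag = true;
	printf("%d\n", node_allowed(2, 0x1));	/* 0: node 2 not in the mask */
	return 0;
}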
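Returning to the earlier pageblock-flags patch ("inline pageblock lookup
in page free fast paths"): the helpers it inlines boil down to reading or
replacing a few bits inside one word of a shared bitmap, with a
compare-and-exchange retry loop on the write side. Below is a minimal
userspace sketch of that shape only; the names, the simple
bits-per-block layout and the C11 stdatomic calls are assumptions made
for illustration and do not follow the kernel's end_bitidx/mask
convention.

#include <stdatomic.h>
#include <stdio.h>

#define BITS_PER_BLOCK	4UL	/* e.g. NR_PAGEBLOCK_BITS */
#define WORD_BITS	(8 * sizeof(unsigned long))

static unsigned long get_block_flags(atomic_ulong *bitmap,
				     unsigned long block, unsigned long mask)
{
	unsigned long bitidx = block * BITS_PER_BLOCK;
	unsigned long word = atomic_load(&bitmap[bitidx / WORD_BITS]);

	/* Shift the block's bits down and mask off the rest of the word. */
	return (word >> (bitidx % WORD_BITS)) & mask;
}

static void set_block_flags(atomic_ulong *bitmap, unsigned long block,
			    unsigned long flags, unsigned long mask)
{
	unsigned long bitidx = block * BITS_PER_BLOCK;
	atomic_ulong *word = &bitmap[bitidx / WORD_BITS];
	unsigned long shift = bitidx % WORD_BITS;
	unsigned long old = atomic_load(word);

	/* Retry until no other thread modified the word under us. */
	while (!atomic_compare_exchange_weak(word, &old,
			(old & ~(mask << shift)) | ((flags & mask) << shift)))
		;	/* 'old' is reloaded by the failed exchange */
}

int main(void)
{
	static atomic_ulong bitmap[2];

	set_block_flags(bitmap, 3, 0x2, 0x7);	/* e.g. migratetype 2 */
	printf("block 3 flags: %lu\n", get_block_flags(bitmap, 3, 0x7));
	return 0;
}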