summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/memcontrol.h32
-rw-r--r--include/linux/vmpressure.h7
-rw-r--r--mm/memcontrol.c17
-rw-r--r--mm/vmpressure.c78
-rw-r--r--mm/vmscan.c10
5 files changed, 104 insertions, 40 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a355f61a2ed3..c5a51039df57 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -249,6 +249,10 @@ struct mem_cgroup {
struct wb_domain cgwb_domain;
#endif
+#ifdef CONFIG_INET
+ unsigned long socket_pressure;
+#endif
+
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+#define mem_cgroup_from_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
+
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+/**
+ * parent_mem_cgroup - find the accounting parent of a memcg
+ * @memcg: memcg whose parent to find
+ *
+ * Returns the parent memcg, or NULL if this is the root or the memory
+ * controller is in legacy no-hierarchy mode.
+ */
+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+ if (!memcg->memory.parent)
+ return NULL;
+ return mem_cgroup_from_counter(memcg->memory.parent, memory);
+}
+
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
struct mem_cgroup *root)
{
@@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key;
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
#ifdef CONFIG_MEMCG_KMEM
- return memcg->tcp_mem.memory_pressure;
-#else
- return false;
+ if (memcg->tcp_mem.memory_pressure)
+ return true;
#endif
+ do {
+ if (time_before(jiffies, memcg->socket_pressure))
+ return true;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+ return false;
}
#else
#define mem_cgroup_sockets_enabled 0
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3e4535876d37..3347cc3ec0ab 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -12,6 +12,9 @@
struct vmpressure {
unsigned long scanned;
unsigned long reclaimed;
+
+ unsigned long tree_scanned;
+ unsigned long tree_reclaimed;
/* The lock is used to keep the scanned/reclaimed above in sync. */
struct spinlock sr_lock;
@@ -26,7 +29,7 @@ struct vmpressure {
struct mem_cgroup;
#ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
@@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed) {}
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
int prio) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 60ebc486c2aa..df7f144a5a4b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-#define mem_cgroup_from_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
kfree(memcg);
}
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
- if (!memcg->memory.parent)
- return NULL;
- return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -4234,6 +4220,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
+#ifdef CONFIG_INET
+ memcg->socket_pressure = jiffies;
+#endif
return &memcg->css;
free_out:
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..506f03e4be47 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
};
static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
+ enum vmpressure_levels level)
{
struct vmpressure_event *ev;
- enum vmpressure_levels level;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
-
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
+ enum vmpressure_levels level;
spin_lock(&vmpr->sr_lock);
/*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- scanned = vmpr->scanned;
+ scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
return;
}
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
+ level = vmpressure_calc_level(scanned, reclaimed);
+
do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
+ if (vmpressure_event(vmpr, level))
break;
/*
* If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned)
return;
- spin_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- spin_unlock(&vmpr->sr_lock);
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ scanned = vmpr->scanned;
+ spin_unlock(&vmpr->sr_lock);
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (memcg == root_mem_cgroup)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ memcg->socket_pressure = jiffies + HZ;
+ }
+ }
}
/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, vmpressure_win, 0);
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e36d766dade9..bb0cbd4c9f01 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long reclaimed;
unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
@@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
+ reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg, sc->nr_scanned - scanned,
lru_pages);
+ /* Record the group's reclaim efficiency */
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
reclaim_state->reclaimed_slab = 0;
}
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ /* Record the subtree's reclaim efficiency */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);