diff options
Diffstat (limited to 'mm/slub.c')
-rw-r--r-- | mm/slub.c | 412 |
1 files changed, 271 insertions, 141 deletions
diff --git a/mm/slub.c b/mm/slub.c index be36f1df809c..21f71cb6cc06 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -750,6 +750,50 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, return false; } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + unsigned int kasan_meta_size; + + if (!slub_debug_orig_size(s)) + return; + + /* + * KASAN can save its free meta data inside of the object at offset 0. + * If this meta data size is larger than 'orig_size', it will overlap + * the data redzone in [orig_size+1, object_size]. Thus, we adjust + * 'orig_size' to be as at least as big as KASAN's meta data. + */ + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size > orig_size) + orig_size = kasan_meta_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -979,50 +1023,6 @@ static void print_slab_info(const struct slab *slab) &slab->__page_flags); } -/* - * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API - * family will round up the real request size to these fixed ones, so - * there could be an extra area than what is requested. Save the original - * request size in the meta data area, for better debug and sanity check. - */ -static inline void set_orig_size(struct kmem_cache *s, - void *object, unsigned int orig_size) -{ - void *p = kasan_reset_tag(object); - unsigned int kasan_meta_size; - - if (!slub_debug_orig_size(s)) - return; - - /* - * KASAN can save its free meta data inside of the object at offset 0. - * If this meta data size is larger than 'orig_size', it will overlap - * the data redzone in [orig_size+1, object_size]. Thus, we adjust - * 'orig_size' to be as at least as big as KASAN's meta data. - */ - kasan_meta_size = kasan_metadata_size(s, true); - if (kasan_meta_size > orig_size) - orig_size = kasan_meta_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - *(unsigned int *)p = orig_size; -} - -static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) -{ - void *p = kasan_reset_tag(object); - - if (!slub_debug_orig_size(s)) - return s->object_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - return *(unsigned int *)p; -} - void skip_orig_size_check(struct kmem_cache *s, const void *object) { set_orig_size(s, (void *)object, s->object_size); @@ -1888,7 +1888,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} - #ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) @@ -2183,6 +2182,45 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, obj_exts); } + +static __fastpath_inline +bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + struct slabobj_ext *slab_exts; + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + unsigned long off; + + folio = virt_to_folio(p); + if (!folio_test_slab(folio)) { + return folio_memcg_kmem(folio) || + (__memcg_kmem_charge_page(folio_page(folio, 0), flags, + folio_order(folio)) == 0); + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + /* + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency + * of slab_obj_exts being allocated from the same slab and thus the slab + * becoming effectively unfreeable. + */ + if (is_kmalloc_normal(s)) + return true; + + /* Ignore already charged objects. */ + slab_exts = slab_obj_exts(slab); + if (slab_exts) { + off = obj_to_index(s, slab, p); + if (unlikely(slab_exts[off].objcg)) + return true; + } + + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); +} + #else /* CONFIG_MEMCG */ static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, @@ -2196,18 +2234,37 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } + +static inline bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + return true; +} #endif /* CONFIG_MEMCG */ +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); + +struct rcu_delayed_free { + struct rcu_head head; + void *object; +}; +#endif + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. * * Returns true if freeing of the object can proceed, false if its reuse - * was delayed by KASAN quarantine, or it was returned to KFENCE. + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned + * to KFENCE. */ static __always_inline -bool slab_free_hook(struct kmem_cache *s, void *x, bool init) +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, + bool after_rcu_delay) { + /* Are the object contents still accessible? */ + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2217,7 +2274,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) debug_check_no_obj_freed(x, s->object_size); /* Use KCSAN to help debug racy use-after-free. */ - if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + if (!still_accessible) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); @@ -2225,6 +2282,35 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) return false; /* + * Give KASAN a chance to notice an invalid free operation before we + * modify the object. + */ + if (kasan_slab_pre_free(s, x)) + return false; + +#ifdef CONFIG_SLUB_RCU_DEBUG + if (still_accessible) { + struct rcu_delayed_free *delayed_free; + + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); + if (delayed_free) { + /* + * Let KASAN track our call stack as a "related work + * creation", just like if the object had been freed + * normally via kfree_rcu(). + * We have to do this manually because the rcu_head is + * not located inside the object. + */ + kasan_record_aux_stack_noalloc(x); + + delayed_free->object = x; + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); + return false; + } + } +#endif /* CONFIG_SLUB_RCU_DEBUG */ + + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be * kept together to avoid discrepancies in behavior. @@ -2237,17 +2323,24 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) */ if (unlikely(init)) { int rsize; - unsigned int inuse; + unsigned int inuse, orig_size; inuse = get_info_end(s); + orig_size = get_orig_size(s, x); if (!kasan_has_integrated_init()) - memset(kasan_reset_tag(x), 0, s->object_size); + memset(kasan_reset_tag(x), 0, orig_size); rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; memset((char *)kasan_reset_tag(x) + inuse, 0, s->size - inuse - rsize); + /* + * Restore orig_size, otherwize kmalloc redzone overwritten + * would be reported + */ + set_orig_size(s, x, orig_size); + } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init, still_accessible); } static __fastpath_inline @@ -2261,7 +2354,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false); + slab_free_hook(s, next, false, false); return false; } @@ -2276,7 +2369,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, init))) { + if (likely(slab_free_hook(s, object, init, false))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2316,7 +2409,11 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct slab *slab; unsigned int order = oo_order(oo); - folio = (struct folio *)alloc_pages_node(node, flags, order); + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_pages(flags, order); + else + folio = (struct folio *)__alloc_pages_node(node, flags, order); + if (!folio) return NULL; @@ -3414,14 +3511,15 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + int cpu = raw_smp_processor_id(); int node; struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nid, gfpflags, &gfpflags); + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -3920,8 +4018,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. * - * Note that we also wipe custom freelist pointers specified via - * s->rcu_freeptr_offset. + * Note that we also wipe custom freelist pointers. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -4063,6 +4160,15 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); +bool kmem_cache_charge(void *objp, gfp_t gfpflags) +{ + if (!memcg_kmem_online()) + return true; + + return memcg_slab_post_charge(objp, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_charge); + /** * kmem_cache_alloc_node - Allocate an object on the specified node * @s: The cache to allocate from. @@ -4471,7 +4577,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, slab, object, object, 1, addr); } @@ -4480,7 +4586,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } #endif @@ -4499,6 +4605,33 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, do_slab_free(s, slab, head, tail, cnt, addr); } +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) +{ + struct rcu_delayed_free *delayed_free = + container_of(rcu_head, struct rcu_delayed_free, head); + void *object = delayed_free->object; + struct slab *slab = virt_to_slab(object); + struct kmem_cache *s; + + kfree(delayed_free); + + if (WARN_ON(is_kfence_address(object))) + return; + + /* find the object and the cache again */ + if (WARN_ON(!slab)) + return; + s = slab->slab_cache; + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) + return; + + /* resume freeing */ + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + do_slab_free(s, slab, object, object, 1, _THIS_IP_); +} +#endif /* CONFIG_SLUB_RCU_DEBUG */ + #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { @@ -5145,17 +5278,11 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } -/* Was a valid freeptr offset requested? */ -static inline bool has_freeptr_offset(const struct kmem_cache *s) -{ - return s->rcu_freeptr_offset != UINT_MAX; -} - /* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -5196,7 +5323,7 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { @@ -5218,8 +5345,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { - s->offset = s->rcu_freeptr_offset; + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep @@ -5294,65 +5421,6 @@ static int calculate_sizes(struct kmem_cache *s) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) -{ - s->flags = kmem_cache_flags(flags, s->name); -#ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); -#endif - - if (!calculate_sizes(s)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. - */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s)) - goto error; - } - } - -#ifdef system_has_freelist_aba - if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; - } -#endif - - /* - * The larger the object size is, the more slabs we want on the partial - * list to avoid pounding the page allocator excessively. - */ - s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); - s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - - set_cpu_partial(s); - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - - /* Initialize the pre-computed randomized freelist if slab is up */ - if (slab_state >= UP) { - if (init_cache_random_seq(s)) - goto error; - } - - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - -error: - __kmem_cache_release(s); - return -EINVAL; -} - static void list_slab_objects(struct kmem_cache *s, struct slab *slab, const char *text) { @@ -5906,28 +5974,90 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->name = name; + s->size = s->object_size = size; + + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif + + if (!calculate_sizes(args, s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(args, s)) + goto out; + } + } + +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } +#endif + + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. + */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); + + set_cpu_partial(s); + +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } + + if (!init_kmem_cache_nodes(s)) + goto out; + + if (!alloc_kmem_cache_cpus(s)) + goto out; /* Mutex is not taken during early boot */ - if (slab_state <= UP) - return 0; + if (slab_state <= UP) { + err = 0; + goto out; + } err = sysfs_slab_add(s); - if (err) { - __kmem_cache_release(s); - return err; - } + if (err) + goto out; if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); - return 0; +out: + if (err) + __kmem_cache_release(s); + return err; } #ifdef SLAB_SUPPORTS_SYSFS |