From fdde6abb3e8dd45e4b026fcf32d40aed96ce3944 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 29 Jul 2011 18:22:13 +0200 Subject: [PATCH 01/16] slab: use print_hex_dump Less code and the advantage of ascii dump. before: | Slab corruption: names_cache start=c5788000, len=4096 | 000: 6b 6b 01 00 00 00 56 00 00 00 24 00 00 00 2a 00 | 010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 | 020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ff | 030: ff ff ff ff e2 b4 17 18 c7 e4 08 06 00 01 08 00 | 040: 06 04 00 01 e2 b4 17 18 c7 e4 0a 00 00 01 00 00 | 050: 00 00 00 00 0a 00 00 02 6b 6b 6b 6b 6b 6b 6b 6b after: | Slab corruption: size-4096 start=c38a9000, len=4096 | 000: 6b 6b 01 00 00 00 56 00 00 00 24 00 00 00 2a 00 kk....V...$...*. | 010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ | 020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ff ................ | 030: ff ff ff ff d2 56 5f aa db 9c 08 06 00 01 08 00 .....V_......... | 040: 06 04 00 01 d2 56 5f aa db 9c 0a 00 00 01 00 00 .....V_......... | 050: 00 00 00 00 0a 00 00 02 6b 6b 6b 6b 6b 6b 6b 6b ........kkkkkkkk Acked-by: Christoph Lameter Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pekka Enberg --- mm/slab.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 1e523ed47c61..41fc5781c7cc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1811,15 +1811,15 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x:", offset); + printk(KERN_ERR "%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; bad_count++; } - printk(" %02x", (unsigned char)data[offset + i]); } - printk("\n"); + print_hex_dump(KERN_CONT, "", 0, 16, 1, + &data[offset], limit, 1); if (bad_count == 1) { error ^= POISON_FREE; @@ -2989,14 +2989,9 @@ bad: printk(KERN_ERR "slab: Internal list corruption detected in " "cache '%s'(%d), slabp %p(%d). Hexdump:\n", cachep->name, cachep->num, slabp, slabp->inuse); - for (i = 0; - i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); - i++) { - if (i % 16 == 0) - printk("\n%03x:", i); - printk(" %02x", ((unsigned char *)slabp)[i]); - } - printk("\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, + sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), + 1); BUG(); } } From ffc79d2880009ea0460d679f8413cfa40366bef4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 29 Jul 2011 14:10:20 +0200 Subject: [PATCH 02/16] slub: use print_hex_dump Less code and same functionality. The output would be: | Object c7428000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk | Object c7428010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk | Object c7428020: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk | Object c7428030: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkk. | Redzone c742803c: bb bb bb bb .... | Padding c7428064: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ | Padding c7428074: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZ Acked-by: Christoph Lameter Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pekka Enberg --- mm/slub.c | 44 +++++++++----------------------------------- 1 file changed, 9 insertions(+), 35 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index eb5a8f93338a..2dc22160aff1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -467,34 +467,8 @@ static int disable_higher_order_debug; */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -625,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->objsize, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, + print_section("Redzone ", p + s->objsize, s->inuse - s->objsize); if (s->offset) @@ -643,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -838,7 +812,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", end - remainder, remainder); + print_section("Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -987,7 +961,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, s->objsize); dump_stack(); } From 69cb8e6b7c2982e015d2b35a34ac2674c79e801c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:22 -0500 Subject: [PATCH 03/16] slub: free slabs without holding locks There are two situations in which slub holds a lock while releasing pages: A. During kmem_cache_shrink() B. During kmem_cache_close() For A build a list while holding the lock and then release the pages later. In case of B we are the last remaining user of the slab so there is no need to take the listlock. After this patch all calls to the page allocator to free pages are done without holding any spinlocks. kmem_cache_destroy() will still hold the slub_lock semaphore. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 9f662d70eb47..30c4558acc8b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2970,13 +2970,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); @@ -2986,7 +2986,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) "Objects remaining on kmem_cache_close()"); } } - spin_unlock_irqrestore(&n->list_lock, flags); } /* @@ -3020,6 +3019,7 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -3028,8 +3028,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3347,23 +3347,23 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse) { - remove_partial(n, page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) + n->nr_partial--; } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); From 7db0d7054048da140798d8bfdd7fff5f74f85abf Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:23 -0500 Subject: [PATCH 04/16] slub: Remove useless statements in __slab_alloc Two statements in __slab_alloc() do not have any effect. 1. c->page is already set to NULL by deactivate_slab() called right before. 2. gfpflags are masked in new_slab() before being passed to the page allocator. There is no need to mask gfpflags in __slab_alloc in particular since most frequent processing in __slab_alloc does not require the use of a gfpmask. Cc: torvalds@linux-foundation.org Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 30c4558acc8b..d4b76bea5dba 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2064,9 +2064,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; - page = c->page; if (!page) goto new_slab; @@ -2163,7 +2160,6 @@ debug: c->freelist = get_freepointer(s, object); deactivate_slab(s, c); - c->page = NULL; c->node = NUMA_NO_NODE; local_irq_restore(flags); return object; From e6e82ea1127d899af7eee876f1c1103716d13772 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:24 -0500 Subject: [PATCH 05/16] slub: Prepare inuse field in new_slab() inuse will always be set to page->objects. There is no point in initializing the field to zero in new_slab() and then overwriting the value in __slab_alloc(). Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d4b76bea5dba..83bb81de4468 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1447,7 +1447,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; page->frozen = 1; out: return page; @@ -2139,7 +2139,6 @@ new_slab: */ object = page->freelist; page->freelist = NULL; - page->inuse = page->objects; stat(s, ALLOC_SLAB); c->node = page_to_nid(page); @@ -2681,7 +2680,7 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse++; + page->inuse = 1; page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG From acd19fd1a7b5152cf29f67aaab23aa61078aaa74 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:25 -0500 Subject: [PATCH 06/16] slub: pass kmem_cache_cpu pointer to get_partial() Pass the kmem_cache_cpu pointer to get_partial(). That way we can avoid the this_cpu_write() statements. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 83bb81de4468..cb53affecca7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1557,7 +1557,8 @@ static inline void remove_partial(struct kmem_cache_node *n, * Must hold list_lock. */ static inline int acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page) + struct kmem_cache_node *n, struct page *page, + struct kmem_cache_cpu *c) { void *freelist; unsigned long counters; @@ -1586,9 +1587,9 @@ static inline int acquire_slab(struct kmem_cache *s, if (freelist) { /* Populate the per cpu freelist */ - this_cpu_write(s->cpu_slab->freelist, freelist); - this_cpu_write(s->cpu_slab->page, page); - this_cpu_write(s->cpu_slab->node, page_to_nid(page)); + c->freelist = freelist; + c->page = page; + c->node = page_to_nid(page); return 1; } else { /* @@ -1606,7 +1607,7 @@ static inline int acquire_slab(struct kmem_cache *s, * Try to allocate a partial slab from a specific node. */ static struct page *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n) + struct kmem_cache_node *n, struct kmem_cache_cpu *c) { struct page *page; @@ -1621,7 +1622,7 @@ static struct page *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (acquire_slab(s, n, page)) + if (acquire_slab(s, n, page, c)) goto out; page = NULL; out: @@ -1632,7 +1633,8 @@ out: /* * Get a page from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; @@ -1672,7 +1674,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(s, n); + page = get_partial_node(s, n, c); if (page) { put_mems_allowed(); return page; @@ -1687,16 +1689,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) /* * Get a partial page, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { struct page *page; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(s, get_node(s, searchnode)); + page = get_partial_node(s, get_node(s, searchnode), c); if (page || node != NUMA_NO_NODE) return page; - return get_any_partial(s, flags); + return get_any_partial(s, flags, c); } #ifdef CONFIG_PREEMPT @@ -1765,9 +1768,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } -/* - * Remove the cpu slab - */ /* * Remove the cpu slab @@ -2116,7 +2116,7 @@ load_freelist: return object; new_slab: - page = get_partial(s, gfpflags, node); + page = get_partial(s, gfpflags, node, c); if (page) { stat(s, ALLOC_FROM_PARTIAL); object = c->freelist; From 497b66f2ecc97844493e6a147fd5a7e73f73f408 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:26 -0500 Subject: [PATCH 07/16] slub: return object pointer from get_partial() / new_slab(). There is no need anymore to return the pointer to a slab page from get_partial() since the page reference can be stored in the kmem_cache_cpu structures "page" field. Return an object pointer instead. That in turn allows a simplification of the spaghetti code in __slab_alloc(). Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 137 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 75 insertions(+), 62 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index cb53affecca7..df381af963b7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1554,9 +1554,11 @@ static inline void remove_partial(struct kmem_cache_node *n, * Lock slab, remove from the partial list and put the object into the * per cpu freelist. * + * Returns a list of objects or NULL if it fails. + * * Must hold list_lock. */ -static inline int acquire_slab(struct kmem_cache *s, +static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, struct kmem_cache_cpu *c) { @@ -1587,10 +1589,11 @@ static inline int acquire_slab(struct kmem_cache *s, if (freelist) { /* Populate the per cpu freelist */ - c->freelist = freelist; c->page = page; c->node = page_to_nid(page); - return 1; + stat(s, ALLOC_FROM_PARTIAL); + + return freelist; } else { /* * Slab page came from the wrong list. No object to allocate @@ -1599,17 +1602,18 @@ static inline int acquire_slab(struct kmem_cache *s, */ printk(KERN_ERR "SLUB: %s : Page without available objects on" " partial list\n", s->name); - return 0; + return NULL; } } /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache *s, +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, struct kmem_cache_cpu *c) { struct page *page; + void *object; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1621,13 +1625,15 @@ static struct page *get_partial_node(struct kmem_cache *s, return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (acquire_slab(s, n, page, c)) + list_for_each_entry(page, &n->partial, lru) { + object = acquire_slab(s, n, page, c); + if (object) goto out; - page = NULL; + } + object = NULL; out: spin_unlock(&n->list_lock); - return page; + return object; } /* @@ -1641,7 +1647,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); - struct page *page; + void *object; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1674,10 +1680,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(s, n, c); - if (page) { + object = get_partial_node(s, n, c); + if (object) { put_mems_allowed(); - return page; + return object; } } } @@ -1689,15 +1695,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, /* * Get a partial page, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node, +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { - struct page *page; + void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(s, get_node(s, searchnode), c); - if (page || node != NUMA_NO_NODE) - return page; + object = get_partial_node(s, get_node(s, searchnode), c); + if (object || node != NUMA_NO_NODE) + return object; return get_any_partial(s, flags, c); } @@ -2027,6 +2033,35 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) } } +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *object; + struct kmem_cache_cpu *c; + struct page *page = new_slab(s, flags, node); + + if (page) { + c = __this_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->node = page_to_nid(page); + c->page = page; + *pc = c; + } else + object = NULL; + + return object; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. @@ -2049,7 +2084,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void **object; - struct page *page; unsigned long flags; struct page new; unsigned long counters; @@ -2064,8 +2098,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - page = c->page; - if (!page) + if (!c->page) goto new_slab; if (unlikely(!node_match(c, node))) { @@ -2077,8 +2110,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLOWPATH); do { - object = page->freelist; - counters = page->counters; + object = c->page->freelist; + counters = c->page->counters; new.counters = counters; VM_BUG_ON(!new.frozen); @@ -2090,12 +2123,12 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * * If there are objects left then we retrieve them * and use them to refill the per cpu queue. - */ + */ - new.inuse = page->objects; + new.inuse = c->page->objects; new.frozen = object != NULL; - } while (!__cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, c->page, object, counters, NULL, new.counters, "__slab_alloc")); @@ -2109,53 +2142,33 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: - VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); return object; new_slab: - page = get_partial(s, gfpflags, node, c); - if (page) { - stat(s, ALLOC_FROM_PARTIAL); - object = c->freelist; + object = get_partial(s, gfpflags, node, c); - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + if (unlikely(!object)) { + + object = new_slab_objects(s, gfpflags, node, &c); + + if (unlikely(!object)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); + + local_irq_restore(flags); + return NULL; + } } - page = new_slab(s, gfpflags, node); - - if (page) { - c = __this_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); - - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - object = page->freelist; - page->freelist = NULL; - - stat(s, ALLOC_SLAB); - c->node = page_to_nid(page); - c->page = page; - - if (kmem_cache_debug(s)) - goto debug; + if (likely(!kmem_cache_debug(s))) goto load_freelist; - } - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; -debug: - if (!object || !alloc_debug_processing(s, page, object, addr)) - goto new_slab; + /* Only entered in the debug case */ + if (!alloc_debug_processing(s, c->page, object, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ c->freelist = get_freepointer(s, object); deactivate_slab(s, c); From 49e2258586b423684f03c278149ab46d8f8b6700 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:27 -0500 Subject: [PATCH 08/16] slub: per cpu cache for partial pages Allow filling out the rest of the kmem_cache_cpu cacheline with pointers to partial pages. The partial page list is used in slab_free() to avoid per node lock taking. In __slab_alloc() we can then take multiple partial pages off the per node partial list in one go reducing node lock pressure. We can also use the per cpu partial list in slab_alloc() to avoid scanning partial lists for pages with free objects. The main effect of a per cpu partial list is that the per node list_lock is taken for batches of partial pages instead of individual ones. Potential future enhancements: 1. The pickup from the partial list could be perhaps be done without disabling interrupts with some work. The free path already puts the page into the per cpu partial list without disabling interrupts. 2. __slab_free() may have some code paths that could use optimization. Performance: Before After ./hackbench 100 process 200000 Time: 1953.047 1564.614 ./hackbench 100 process 20000 Time: 207.176 156.940 ./hackbench 100 process 20000 Time: 204.468 156.940 ./hackbench 100 process 20000 Time: 204.879 158.772 ./hackbench 10 process 20000 Time: 20.153 15.853 ./hackbench 10 process 20000 Time: 20.153 15.986 ./hackbench 10 process 20000 Time: 19.363 16.111 ./hackbench 1 process 20000 Time: 2.518 2.307 ./hackbench 1 process 20000 Time: 2.258 2.339 ./hackbench 1 process 20000 Time: 2.864 2.163 Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/mm_types.h | 14 +- include/linux/slub_def.h | 4 + mm/slub.c | 339 +++++++++++++++++++++++++++++++++------ 3 files changed, 309 insertions(+), 48 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 774b8952deb4..7870e473033c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -79,9 +79,21 @@ struct page { }; /* Third double word block */ - struct list_head lru; /* Pageout list, eg. active_list + union { + struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif + }; + }; /* Remainder is not double word aligned */ union { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f58d6413d230..4890ef79d752 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -36,12 +36,15 @@ enum stat_item { ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* USed cpu partial on free */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ + struct page *partial; /* Partially allocated frozen slabs */ int node; /* The node of the page (or -1 for debug) */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; @@ -79,6 +82,7 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ + int cpu_partial; /* Number of per cpu partial pages to keep around */ struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ diff --git a/mm/slub.c b/mm/slub.c index df381af963b7..0e286acef62a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1560,7 +1560,7 @@ static inline void remove_partial(struct kmem_cache_node *n, */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, - struct kmem_cache_cpu *c) + int mode) { void *freelist; unsigned long counters; @@ -1575,7 +1575,8 @@ static inline void *acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; - new.inuse = page->objects; + if (mode) + new.inuse = page->objects; VM_BUG_ON(new.frozen); new.frozen = 1; @@ -1586,34 +1587,20 @@ static inline void *acquire_slab(struct kmem_cache *s, "lock and freeze")); remove_partial(n, page); - - if (freelist) { - /* Populate the per cpu freelist */ - c->page = page; - c->node = page_to_nid(page); - stat(s, ALLOC_FROM_PARTIAL); - - return freelist; - } else { - /* - * Slab page came from the wrong list. No object to allocate - * from. Put it onto the correct list and continue partial - * scan. - */ - printk(KERN_ERR "SLUB: %s : Page without available objects on" - " partial list\n", s->name); - return NULL; - } + return freelist; } +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); + /* * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, struct kmem_cache_cpu *c) { - struct page *page; - void *object; + struct page *page, *page2; + void *object = NULL; + int count = 0; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1625,13 +1612,28 @@ static void *get_partial_node(struct kmem_cache *s, return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) { - object = acquire_slab(s, n, page, c); - if (object) - goto out; + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t = acquire_slab(s, n, page, count == 0); + int available; + + if (!t) + break; + + if (!count) { + c->page = page; + c->node = page_to_nid(page); + stat(s, ALLOC_FROM_PARTIAL); + count++; + object = t; + available = page->objects - page->inuse; + } else { + page->freelist = t; + available = put_cpu_partial(s, page, 0); + } + if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + break; + } - object = NULL; -out: spin_unlock(&n->list_lock); return object; } @@ -1926,6 +1928,123 @@ redo: } } +/* Unfreeze all the cpu partial slabs */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct kmem_cache_node *n = NULL; + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + struct page *page; + + while ((page = c->partial)) { + enum slab_modes { M_PARTIAL, M_FREE }; + enum slab_modes l, m; + struct page new; + struct page old; + + c->partial = page->next; + l = M_FREE; + + do { + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && (!n || n->nr_partial < s->min_partial)) + m = M_FREE; + else { + struct kmem_cache_node *n2 = get_node(s, + page_to_nid(page)); + + m = M_PARTIAL; + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } + } + + if (l != m) { + if (l == M_PARTIAL) + remove_partial(n, page); + else + add_partial(n, page, 1); + + l = m; + } + + } while (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } + } + + if (n) + spin_unlock(&n->list_lock); +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ + struct page *oldpage; + int pages; + int pobjects; + + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s); + local_irq_restore(flags); + pobjects = 0; + pages = 0; + } + } + + pages++; + pobjects += page->objects - page->inuse; + + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; + + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + stat(s, CPU_PARTIAL_FREE); + return pobjects; +} + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); @@ -1941,8 +2060,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s); + } } static void flush_cpu_slab(void *d) @@ -2066,8 +2189,6 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -2100,7 +2221,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!c->page) goto new_slab; - +redo: if (unlikely(!node_match(c, node))) { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, c); @@ -2133,7 +2254,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, NULL, new.counters, "__slab_alloc")); - if (unlikely(!object)) { + if (!object) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2148,6 +2269,17 @@ load_freelist: return object; new_slab: + + if (c->partial) { + c->page = c->partial; + c->partial = c->page->next; + c->node = page_to_nid(c->page); + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; + } + + /* Then do expensive stuff like retrieving pages from the partial lists */ object = get_partial(s, gfpflags, node, c); if (unlikely(!object)) { @@ -2341,16 +2473,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page, was_frozen = new.frozen; new.inuse--; if ((!new.inuse || !prior) && !was_frozen && !n) { - n = get_node(s, page_to_nid(page)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); + + if (!kmem_cache_debug(s) && !prior) + + /* + * Slab was on no list before and will be partially empty + * We can defer the list move and instead freeze it. + */ + new.frozen = 1; + + else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } } inuse = new.inuse; @@ -2360,7 +2505,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page, "__slab_free")); if (likely(!n)) { - /* + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) + put_cpu_partial(s, page, 1); + + /* * The list lock was not taken therefore no list * activity can be necessary. */ @@ -2429,7 +2582,6 @@ static __always_inline void slab_free(struct kmem_cache *s, slab_free_hook(s, x); redo: - /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since @@ -2919,7 +3071,34 @@ static int kmem_cache_open(struct kmem_cache *s, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - set_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in partial partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch 50% + * to keep some capacity around for frees. + */ + if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; + s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -4327,6 +4506,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct page *page; if (!c || c->node < 0) continue; @@ -4342,6 +4522,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[c->node] += x; } + page = c->partial; + + if (page) { + x = page->pobjects; + total += x; + nodes[c->node] += x; + } per_cpu[c->node]++; } } @@ -4493,6 +4680,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(min_partial); +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = strict_strtoul(buf, 10, &objects); + if (err) + return err; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) @@ -4531,6 +4739,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4853,6 +5092,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); #endif static struct attribute *slab_attrs[] = { @@ -4861,6 +5102,7 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, @@ -4873,6 +5115,7 @@ static struct attribute *slab_attrs[] = { &destroy_by_rcu_attr.attr, &shrink_attr.attr, &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, @@ -4914,6 +5157,8 @@ static struct attribute *slab_attrs[] = { &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, From aca726a07a71ff7aedc0e90a91f80a2701adcca5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 16:12:28 -0500 Subject: [PATCH 09/16] slub: update slabinfo tools to report per cpu partial list statistics Update the slabinfo tool to report the stats on per cpu partial list usage. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- tools/slub/slabinfo.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/slub/slabinfo.c b/tools/slub/slabinfo.c index 868cc93f7ac2..47d78533d72e 100644 --- a/tools/slub/slabinfo.c +++ b/tools/slub/slabinfo.c @@ -42,6 +42,7 @@ struct slabinfo { unsigned long deactivate_remote_frees, order_fallback; unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail; unsigned long alloc_node_mismatch, deactivate_bypass; + unsigned long cpu_partial_alloc, cpu_partial_free; int numa[MAX_NODES]; int numa_partial[MAX_NODES]; } slabinfo[MAX_SLABS]; @@ -455,6 +456,11 @@ static void slab_stats(struct slabinfo *s) s->alloc_from_partial * 100 / total_alloc, s->free_remove_partial * 100 / total_free); + printf("Cpu partial list %8lu %8lu %3lu %3lu\n", + s->cpu_partial_alloc, s->cpu_partial_free, + s->cpu_partial_alloc * 100 / total_alloc, + s->cpu_partial_free * 100 / total_free); + printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", s->deactivate_remote_frees, s->free_frozen, s->deactivate_remote_frees * 100 / total_alloc, @@ -1209,6 +1215,8 @@ static void read_slab_dir(void) slab->order_fallback = get_obj("order_fallback"); slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); + slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); + slab->cpu_partial_free = get_obj("cpu_partial_free"); slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); slab->deactivate_bypass = get_obj("deactivate_bypass"); chdir(".."); From 136333d104bd3a62d783b0ac3d0f32ac0108c5d0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 24 Aug 2011 08:57:52 +0800 Subject: [PATCH 10/16] slub: explicitly document position of inserting slab to partial list Adding slab to partial list head/tail is sensitive to performance. So explicitly uses DEACTIVATE_TO_TAIL/DEACTIVATE_TO_HEAD to document it to avoid we get it wrong. Acked-by: Christoph Lameter Signed-off-by: Shaohua Li Signed-off-by: Shaohua Li Signed-off-by: Pekka Enberg --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 7c54fe83a90c..91a120f185d1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1534,7 +1534,7 @@ static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); @@ -1781,13 +1781,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) enum slab_modes l = M_NONE, m = M_NONE; void *freelist; void *nextfree; - int tail = 0; + int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); - tail = 1; + tail = DEACTIVATE_TO_TAIL; } c->tid = next_tid(c->tid); @@ -1893,7 +1893,7 @@ redo: if (m == M_PARTIAL) { add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail); } else if (m == M_FULL) { @@ -2377,7 +2377,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (unlikely(!prior)) { remove_full(s, page); - add_partial(n, page, 1); + add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } @@ -2695,7 +2695,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) From a37933c37c14b64e81c7c9cc44a5d3f5e0c91412 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Tue, 30 Aug 2011 14:52:06 +0800 Subject: [PATCH 11/16] slub: doc: update the slabinfo.c file path slabinfo.c has been moved from Documentaion/vm/ to tools/slub/ by commit:0d24db337e6d81c0c620ab65cc6947bd6553f742 Update the slub.txt doc to reflect this change too. Signed-off-by: Jason Liu Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Pekka Enberg --- Documentation/vm/00-INDEX | 2 -- Documentation/vm/slub.txt | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index dca82d7c83d8..5481c8ba3412 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -30,8 +30,6 @@ page_migration - description of page migration in NUMA systems. pagemap.txt - pagemap, from the userspace perspective -slabinfo.c - - source code for a tool to get reports about slabs. slub.txt - a short users guide for SLUB. unevictable-lru.txt diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index 07375e73981a..f464f47bc60d 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -17,7 +17,7 @@ data and perform operation on the slabs. By default slabinfo only lists slabs that have data in them. See "slabinfo -h" for more options when running the command. slabinfo can be compiled with -gcc -o slabinfo Documentation/vm/slabinfo.c +gcc -o slabinfo tools/slub/slabinfo.c Some of the modes of operation of slabinfo require that slub debugging be enabled on the command line. F.e. no tracking information will be From 12d79634f8d7af5229b7d21143d50e7cf7d94177 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Wed, 7 Sep 2011 10:26:36 +0800 Subject: [PATCH 12/16] slub: Code optimization in get_partial_node() I find a way to reduce a variable in get_partial_node(). That is also helpful for code understanding. Acked-by: Christoph Lameter Signed-off-by: Alex Shi Signed-off-by: Pekka Enberg --- mm/slub.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0e286acef62a..4982fb5c91de 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1600,7 +1600,6 @@ static void *get_partial_node(struct kmem_cache *s, { struct page *page, *page2; void *object = NULL; - int count = 0; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1613,17 +1612,16 @@ static void *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { - void *t = acquire_slab(s, n, page, count == 0); + void *t = acquire_slab(s, n, page, object == NULL); int available; if (!t) break; - if (!count) { + if (!object) { c->page = page; c->node = page_to_nid(page); stat(s, ALLOC_FROM_PARTIAL); - count++; object = t; available = page->objects - page->inuse; } else { From ab067e99d22ec78ff646de1283348729d1aa66d4 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 27 Sep 2011 21:54:53 +0400 Subject: [PATCH 13/16] mm: restrict access to slab files under procfs and sysfs Historically /proc/slabinfo and files under /sys/kernel/slab/* have world read permissions and are accessible to the world. slabinfo contains rather private information related both to the kernel and userspace tasks. Depending on the situation, it might reveal either private information per se or information useful to make another targeted attack. Some examples of what can be learned by reading/watching for /proc/slabinfo entries: 1) dentry (and different *inode*) number might reveal other processes fs activity. The number of dentry "active objects" doesn't strictly show file count opened/touched by a process, however, there is a good correlation between them. The patch "proc: force dcache drop on unauthorized access" relies on the privacy of dentry count. 2) different inode entries might reveal the same information as (1), but these are more fine granted counters. If a filesystem is mounted in a private mount point (or even a private namespace) and fs type differs from other mounted fs types, fs activity in this mount point/namespace is revealed. If there is a single ecryptfs mount point, the whole fs activity of a single user is revealed. Number of files in ecryptfs mount point is a private information per se. 3) fuse_* reveals number of files / fs activity of a user in a user private mount point. It is approx. the same severity as ecryptfs infoleak in (2). 4) sysfs_dir_cache similar to (2) reveals devices' addition/removal, which can be otherwise hidden by "chmod 0700 /sys/". With 0444 slabinfo the precise number of sysfs files is known to the world. 5) buffer_head might reveal some kernel activity. With other information leaks an attacker might identify what specific kernel routines generate buffer_head activity. 6) *kmalloc* infoleaks are very situational. Attacker should watch for the specific kmalloc size entry and filter the noise related to the unrelated kernel activity. If an attacker has relatively silent victim system, he might get rather precise counters. Additional information sources might significantly increase the slabinfo infoleak benefits. E.g. if an attacker knows that the processes activity on the system is very low (only core daemons like syslog and cron), he may run setxid binaries / trigger local daemon activity / trigger network services activity / await sporadic cron jobs activity / etc. and get rather precise counters for fs and network activity of these privileged tasks, which is unknown otherwise. Also hiding slabinfo and /sys/kernel/slab/* is a one step to complicate exploitation of kernel heap overflows (and possibly, other bugs). The related discussion: http://thread.gmane.org/gmane.linux.kernel/1108378 To keep compatibility with old permission model where non-root monitoring daemon could watch for kernel memleaks though slabinfo one should do: groupadd slabinfo usermod -a -G slabinfo $MONITOR_USER And add the following commands to init scripts (to mountall.conf in Ubuntu's upstart case): chmod g+r /proc/slabinfo /sys/kernel/slab/*/* chgrp slabinfo /proc/slabinfo /sys/kernel/slab/*/* Signed-off-by: Vasiliy Kulikov Reviewed-by: Kees Cook Reviewed-by: Dave Hansen Acked-by: Christoph Lameter Acked-by: David Rientjes CC: Valdis.Kletnieks@vt.edu CC: Linus Torvalds CC: Alan Cox Signed-off-by: Pekka Enberg --- mm/slab.c | 2 +- mm/slub.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 5bfc2047afe1..708efe886154 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4579,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slub.c b/mm/slub.c index 3b3f17bc0d17..943f4906131b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4386,11 +4386,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -5231,7 +5232,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); From 9f26490412cf15b04ac8f44a512ba0b09e774576 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 1 Sep 2011 11:32:18 +0800 Subject: [PATCH 14/16] slub: correct comments error for per cpu partial Correct comment errors, that mistake cpu partial objects number as pages number, may make reader misunderstand. Signed-off-by: Alex Shi Reviewed-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 +- mm/slub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 4890ef79d752..a32bcfdc7834 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -82,7 +82,7 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ - int cpu_partial; /* Number of per cpu partial pages to keep around */ + int cpu_partial; /* Number of per cpu partial objects to keep around */ struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ diff --git a/mm/slub.c b/mm/slub.c index 4982fb5c91de..8f687575d310 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3084,7 +3084,7 @@ static int kmem_cache_open(struct kmem_cache *s, * * A) The number of objects from per cpu partial slabs dumped to the * per node list when we reach the limit. - * B) The number of objects in partial partial slabs to extract from the + * B) The number of objects in cpu partial slabs to extract from the * per node list when we run out of per cpu objects. We only fetch 50% * to keep some capacity around for frees. */ From dcc3be6a548a1e51adaab3be6d9dfbb68bc0e3a0 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 6 Sep 2011 14:46:01 +0800 Subject: [PATCH 15/16] slub: Discard slab page when node partial > minimum partial number Discarding slab should be done when node partial > min_partial. Otherwise, node partial slab may eat up all memory. Signed-off-by: Alex Shi Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 8f687575d310..e06f72c81e53 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1953,7 +1953,7 @@ static void unfreeze_partials(struct kmem_cache *s) new.frozen = 0; - if (!new.inuse && (!n || n->nr_partial < s->min_partial)) + if (!new.inuse && (!n || n->nr_partial > s->min_partial)) m = M_FREE; else { struct kmem_cache_node *n2 = get_node(s, From fe353178653b15add8626f5474842601be160281 Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Mon, 17 Oct 2011 16:48:10 +0200 Subject: [PATCH 16/16] tools, slub: Fix off-by-one buffer corruption after readlink() call readlink() never zero terminates the provided buffer. Therefore we already do buffer[count] = 0; This leads to an off-by-one buffer corruption as readlink() might return the full size of the buffer. The common technique is to reduce the buffer size by one. Another fix would be to check if (count < 0 || count == sizeof(buffer)) fatal(); Reducing the buffer size by one is easier IMHO. Signed-off-by: Thomas Jarosch Acked-by: David Rientjes Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- tools/slub/slabinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/slub/slabinfo.c b/tools/slub/slabinfo.c index 868cc93f7ac2..cc1a378f9c06 100644 --- a/tools/slub/slabinfo.c +++ b/tools/slub/slabinfo.c @@ -1145,7 +1145,7 @@ static void read_slab_dir(void) switch (de->d_type) { case DT_LNK: alias->name = strdup(de->d_name); - count = readlink(de->d_name, buffer, sizeof(buffer)); + count = readlink(de->d_name, buffer, sizeof(buffer)-1); if (count < 0) fatal("Cannot read symlink %s\n", de->d_name);