2005-04-17 06:20:36 +08:00
|
|
|
/* pgalloc.c: page directory & page table allocation
|
|
|
|
*
|
|
|
|
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
|
|
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/gfp.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/highmem.h>
|
2007-05-09 17:32:48 +08:00
|
|
|
#include <linux/quicklist.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/cacheflush.h>
|
|
|
|
|
|
|
|
/*
 * Page-aligned table of PTRS_PER_PGD pgd entries.  pgd_ctor() copies the
 * entries from index USER_PGDS_IN_LAST_PML4 upwards out of this table into
 * every pgd it constructs.
 */
pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE)));
|
|
|
|
|
|
|
|
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
|
|
|
|
{
|
|
|
|
pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
|
|
|
|
if (pte)
|
|
|
|
clear_page(pte);
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
2008-02-08 20:22:04 +08:00
|
|
|
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
#ifdef CONFIG_HIGHPTE
|
|
|
|
page = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0);
|
|
|
|
#else
|
|
|
|
page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
|
|
|
|
#endif
|
2008-02-08 20:22:04 +08:00
|
|
|
if (page) {
|
2005-04-17 06:20:36 +08:00
|
|
|
clear_highpage(page);
|
2008-02-08 20:22:04 +08:00
|
|
|
pgtable_page_ctor(page);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fill in every word of the ste[] array behind @pmdptr.
 *
 * A zero @pmd clears PME_SIZE bytes of the array.  A non-zero @pmd is
 * first checked for bits that must not be set (BUG otherwise) and is then
 * stored into successive words, advancing by __frv_PT_SIZE for each word
 * written.  In both cases the range [pmdptr, pmdptr + 1) is afterwards
 * handed to frv_dcache_writeback().
 */
void __set_pmd(pmd_t *pmdptr, unsigned long pmd)
{
	unsigned long *ste = pmdptr->ste;
	int remaining;

	if (pmd) {
		BUG_ON(pmd & (0x3f00 | xAMPRx_SS | 0xe));

		/* the byte count drops by 4 for every word stored */
		for (remaining = PME_SIZE; remaining > 0; remaining -= 4) {
			*ste = pmd;
			ste++;
			pmd += __frv_PT_SIZE;
		}
	} else {
		memset(ste, 0, PME_SIZE);
	}

	frv_dcache_writeback((unsigned long) pmdptr, (unsigned long) (pmdptr + 1));
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* List of all pgd's needed for non-PAE so it can invalidate entries
|
|
|
|
* in both cached and uncached pgd's; not needed for PAE since the
|
|
|
|
* kernel pmd is shared. If PAE were not to share the pmd a similar
|
|
|
|
* tactic would be needed. This is essentially codepath-based locking
|
|
|
|
* against pageattr.c; it is the unique case in which a valid change
|
|
|
|
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
|
|
|
|
* vmalloc faults work because attached pagetables are never freed.
|
|
|
|
* If the locking proves to be non-performant, a ticketing scheme with
|
|
|
|
* checks at dup_mmap(), exec(), and other mmlist addition points
|
|
|
|
* could be used. The locking scheme was chosen on the basis of
|
|
|
|
* manfred's recommendations and having no core impact whatsoever.
|
|
|
|
* -- wli
|
|
|
|
*/
|
|
|
|
/* Held (via spin_lock_irqsave) while pgd_ctor() and pgd_dtor() modify pgd_list. */
DEFINE_SPINLOCK(pgd_lock);
|
|
|
|
/*
 * Head of the list of pgd pages.  Each page stores the next page in
 * page->index and, in its private field, the address of the pointer that
 * refers to it (see pgd_list_add() and pgd_list_del()).
 */
struct page *pgd_list;
|
|
|
|
|
|
|
|
/*
 * Push the page backing @pgd onto the front of pgd_list.
 *
 * page->index holds the next page in the list; the page-private field
 * holds the address of whichever pointer refers to this page (either
 * pgd_list itself or the previous page's index field).  The callers in
 * this file hold pgd_lock around this call.
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *new_head = virt_to_page(pgd);
	struct page *old_head = pgd_list;

	/* forward link: the new page is followed by the old head */
	new_head->index = (unsigned long) old_head;

	/* back link of the old head now points at our index field */
	if (old_head)
		set_page_private(old_head, (unsigned long) &new_head->index);

	/* the new head is referred to by pgd_list itself */
	set_page_private(new_head, (unsigned long)&pgd_list);
	pgd_list = new_head;
}
|
|
|
|
|
|
|
|
/*
 * Unlink the page backing @pgd from pgd_list.
 *
 * The pointer that referred to this page (its address is kept in the
 * page-private field) is redirected to the following page, and that
 * page's back link, if there is a following page, is updated to match.
 */
static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *victim = virt_to_page(pgd);
	struct page *successor = (struct page *) victim->index;
	struct page **back_link = (struct page **) page_private(victim);

	*back_link = successor;

	if (successor)
		set_page_private(successor, (unsigned long) back_link);
}
|
|
|
|
|
2007-05-09 17:32:48 +08:00
|
|
|
void pgd_ctor(void *pgd)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
if (PTRS_PER_PMD == 1)
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
|
|
|
|
|
|
memcpy((pgd_t *) pgd + USER_PGDS_IN_LAST_PML4,
|
|
|
|
swapper_pg_dir + USER_PGDS_IN_LAST_PML4,
|
|
|
|
(PTRS_PER_PGD - USER_PGDS_IN_LAST_PML4) * sizeof(pgd_t));
|
|
|
|
|
|
|
|
if (PTRS_PER_PMD > 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pgd_list_add(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
memset(pgd, 0, USER_PGDS_IN_LAST_PML4 * sizeof(pgd_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* never called when PTRS_PER_PMD > 1 */
|
2007-05-09 17:32:48 +08:00
|
|
|
void pgd_dtor(void *pgd)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags; /* can be called from interrupt context */
|
|
|
|
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
|
|
pgd_list_del(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
pgd_t *pgd_alloc(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
|
2007-05-09 17:32:48 +08:00
|
|
|
pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!pgd)
|
|
|
|
return pgd;
|
|
|
|
|
|
|
|
return pgd;
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:29:14 +08:00
|
|
|
/*
 * Return @pgd to quicklist 0, naming pgd_dtor() as its destructor.
 * @mm is not consulted.
 */
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	/*
	 * In the non-PAE case the user pgd entries have already been
	 * cleared by clear_page_tables(), so nothing is wiped here.
	 */
	quicklist_free(0, pgd_dtor, pgd);
}
|
|
|
|
|
|
|
|
/* Page table cache initialisation hook; there is nothing to set up here. */
void __init pgtable_cache_init(void)
{
	/* intentionally empty */
}
|
2007-05-09 17:32:48 +08:00
|
|
|
|
|
|
|
void check_pgt_cache(void)
|
|
|
|
{
|
|
|
|
quicklist_trim(0, pgd_dtor, 25, 16);
|
|
|
|
}
|
|
|
|
|