summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm/gup.c
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2009-10-26 19:24:31 +0000
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2009-10-30 17:20:58 +1100
commita4fe3ce7699bfe1bd88f816b55d42d8fe1dac655 (patch)
treeb72c982ffbb9f05d78a952288d60c4dc2d31a4d9 /arch/powerpc/mm/gup.c
parenta0668cdc154e54bf0c85182e0535eea237d53146 (diff)
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different pagetable layout: that is, the bottem level table of pointers to hugepages is a different size, and may branch off from the normal page tables at a different level. Every hugepage aware path that needs to walk the pagetables must therefore look up the hugepage size from the slice info first, and work out the correct way to walk the pagetables accordingly. Future hardware is likely to add more possible hugepage sizes, more layout options and more mess. This patch, therefore reworks the handling of hugepage pagetables to reduce this complexity. In the new scheme, instead of having to consult the slice mask, pagetable walking code can check a flag in the PGD/PUD/PMD entries to see where to branch off to hugepage pagetables, and the entry also contains the information (eseentially hugepage shift) necessary to then interpret that table without recourse to the slice mask. This scheme can be extended neatly to handle multiple levels of self-describing "special" hugepage pagetables, although for now we assume only one level exists. This approach means that only the pagetable allocation path needs to know how the pagetables should be set out. All other (hugepage) pagetable walking paths can just interpret the structure as they go. There already was a flag bit in PGD/PUD/PMD entries for hugepage directory pointers, but it was only used for debug. We alter that flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable pointer (normally it would be 1 since the pointer lies in the linear mapping). This means that asm pagetable walking can test for (and punt on) hugepage pointers with the same test that checks for unpopulated page directory entries (beq becomes bge), since hugepage pointers will always be positive, and normal pointers always negative. While we're at it, we get rid of the confusing (and grep defeating) #defining of hugepte_shift to be the same thing as mmu_huge_psizes. Signed-off-by: David Gibson <dwg@au1.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/mm/gup.c')
-rw-r--r--arch/powerpc/mm/gup.c149
1 files changed, 26 insertions, 123 deletions
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 1;
}
-#ifdef CONFIG_HUGETLB_PAGE
-static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
- unsigned long *addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long mask;
- unsigned long pte_end;
- struct page *head, *page;
- pte_t pte;
- int refs;
-
- pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
- if (pte_end < end)
- end = pte_end;
-
- pte = *ptep;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pte_val(pte) & mask) != mask)
- return 0;
- /* hugepages are never "special" */
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- refs = 0;
- head = pte_page(pte);
- page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (*addr += PAGE_SIZE, *addr != end);
-
- if (!page_cache_add_speculative(head, refs)) {
- *nr -= refs;
- return 0;
- }
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- while (*nr) {
- put_page(page);
- (*nr)--;
- }
- }
-
- return 1;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
return 0;
- if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ if (is_hugepd(pmdp)) {
+ if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+ addr, next, write, pages, nr))
+ return 0;
+ } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ if (is_hugepd(pudp)) {
+ if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+ addr, next, write, pages, nr))
+ return 0;
+ } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
unsigned long next;
pgd_t *pgdp;
int nr = 0;
-#ifdef CONFIG_PPC64
- unsigned int shift;
- int psize;
-#endif
pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
pr_devel(" aligned: %lx .. %lx\n", start, end);
-#ifdef CONFIG_HUGETLB_PAGE
- /* We bail out on slice boundary crossing when hugetlb is
- * enabled in order to not have to deal with two different
- * page table formats
- */
- if (addr < SLICE_LOW_TOP) {
- if (end > SLICE_LOW_TOP)
- goto slow_irqon;
-
- if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
- GET_LOW_SLICE_INDEX(end - 1)))
- goto slow_irqon;
- } else {
- if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
- GET_HIGH_SLICE_INDEX(end - 1)))
- goto slow_irqon;
- }
-#endif /* CONFIG_HUGETLB_PAGE */
-
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
*/
local_irq_disable();
-#ifdef CONFIG_PPC64
- /* Those bits are related to hugetlbfs implementation and only exist
- * on 64-bit for now
- */
- psize = get_slice_psize(mm, addr);
- shift = mmu_psize_defs[psize].shift;
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_HUGETLB_PAGE
- if (unlikely(mmu_huge_psizes[psize])) {
- pte_t *ptep;
- unsigned long a = addr;
- unsigned long sz = ((1UL) << shift);
- struct hstate *hstate = size_to_hstate(sz);
-
- BUG_ON(!hstate);
- /*
- * XXX: could be optimized to avoid hstate
- * lookup entirely (just use shift)
- */
-
- do {
- VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
- ptep = huge_pte_offset(mm, a);
- pr_devel(" %016lx: huge ptep %p\n", a, ptep);
- if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
- &nr))
- goto slow;
- } while (a != end);
- } else
-#endif /* CONFIG_HUGETLB_PAGE */
- {
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
-#ifdef CONFIG_PPC64
- VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
-#endif
- pr_devel(" %016lx: normal pgd %p\n", addr,
- (void *)pgd_val(pgd));
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ pr_devel(" %016lx: normal pgd %p\n", addr,
+ (void *)pgd_val(pgd));
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (is_hugepd(pgdp)) {
+ if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+ addr, next, write, pages, &nr))
goto slow;
- } while (pgdp++, addr = next, addr != end);
- }
+ } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+
local_irq_enable();
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);