/*
 * Lockless get_user_pages_fast for powerpc
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#undef DEBUG

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

#ifdef __HAVE_ARCH_PTE_SPECIAL

static inline void get_huge_page_tail(struct page *page)
{
	/*
	 * __split_huge_page_refcount() cannot run
	 * from under us.
	 */
	VM_BUG_ON(atomic_read(&page->_count) < 0);
	atomic_inc(&page->_count);
}

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask, result;
	pte_t *ptep;

	result = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		result |= _PAGE_RW;
	mask = result | _PAGE_SPECIAL;

	ptep = pte_offset_kernel(&pmd, addr);
	do {
		pte_t pte = *ptep;
		struct page *page;

		if ((pte_val(pte) & mask) != result)
			return 0;
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
		if (!page_cache_get_speculative(page))
			return 0;
		/*
		 * The speculative reference may have raced with the page
		 * being freed and reused; recheck the PTE and back out if
		 * it changed under us.
		 */
		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
			put_page(page);
			return 0;
		}
		if (PageTail(page))
			get_huge_page_tail(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);

	return 1;
}
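
/*
 * Illustrative sketch of what the single "(pte_val(pte) & mask) != result"
 * comparison above checks; the helper name is made up for illustration and
 * nothing calls it.  A PTE qualifies for lockless GUP only if it is present,
 * user-accessible, writable when a write is requested, and not special
 * (special mappings have no struct page to pin).
 */
static inline int example_pte_allows_gup(pte_t pte, int write)
{
	unsigned long v = pte_val(pte);

	if (!(v & _PAGE_PRESENT) || !(v & _PAGE_USER))
		return 0;
	if (write && !(v & _PAGE_RW))
		return 0;
	if (v & _PAGE_SPECIAL)
		return 0;
	return 1;
}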

/*
 * Background, from the commit "powerpc/mm: Allow more flexible layouts
 * for hugepage pagetables":
 *
 * Each available hugepage size used to have a slightly different
 * pagetable layout: that is, the bottom level table of pointers to
 * hugepages was a different size, and could branch off from the normal
 * page tables at a different level.  Every hugepage-aware path that
 * needed to walk the pagetables therefore had to look up the hugepage
 * size from the slice info first, and work out the correct way to walk
 * the pagetables accordingly.  Future hardware is likely to add more
 * possible hugepage sizes, more layout options and more mess.
 *
 * That commit reworked the handling of hugepage pagetables to reduce
 * this complexity.  In the new scheme, instead of having to consult the
 * slice mask, pagetable walking code can check a flag in the PGD/PUD/PMD
 * entries to see where to branch off to hugepage pagetables, and the
 * entry also contains the information (essentially the hugepage shift)
 * necessary to then interpret that table without recourse to the slice
 * mask.  This scheme can be extended neatly to handle multiple levels of
 * self-describing "special" hugepage pagetables, although for now we
 * assume only one level exists.
 *
 * This approach means that only the pagetable allocation path needs to
 * know how the pagetables should be set out.  All other (hugepage)
 * pagetable walking paths can just interpret the structure as they go.
 *
 * There already was a flag bit in PGD/PUD/PMD entries for hugepage
 * directory pointers, but it was only used for debug.  That flag bit is
 * now a 0 in the MSB to indicate a hugepage pagetable pointer (normally
 * it would be 1 since the pointer lies in the linear mapping).  This
 * means that asm pagetable walking can test for (and punt on) hugepage
 * pointers with the same test that checks for unpopulated page directory
 * entries (beq becomes bge), since hugepage pointers will always be
 * positive, and normal pointers always negative.
 */
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = *pmdp;

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			return 0;
		if (is_hugepd(pmdp)) {
			if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
					addr, next, write, pages, nr))
				return 0;
		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}
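
/*
 * Minimal illustrative sketch of the test described in the background
 * comment above gup_pmd_range(); the real check used in this file is the
 * is_hugepd() macro, and the helper name and raw-value encoding below are
 * assumptions made purely for illustration.  A hugepage-directory pointer
 * is stored with its most significant bit clear, while normal pagetable
 * pointers lie in the kernel linear mapping and so have the MSB set; a
 * non-empty, non-negative entry therefore marks a hugepage directory.
 */
static inline int example_pd_is_hugepd(unsigned long pdval)
{
	if (pdval == 0)
		return 0;		/* unpopulated entry */
	return (long)pdval >= 0;	/* MSB clear => hugepage directory */
}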

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&pgd, addr);
	do {
		pud_t pud = *pudp;

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (is_hugepd(pudp)) {
			if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
					addr, next, write, pages, nr))
				return 0;
		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;

	pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;

	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					start, len)))
		goto slow_irqon;

	pr_devel(" aligned: %lx .. %lx\n", start, end);

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency.
	 * Needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables from being freed on powerpc.
	 *
	 * So long as we atomically load page table pointers versus teardown,
	 * we can follow the address down to the page and take a ref on it.
	 */
	local_irq_disable();

	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		pr_devel(" %016lx: normal pgd %p\n", addr,
			 (void *)pgd_val(pgd));
		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			goto slow;
		if (is_hugepd(pgdp)) {
			if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
					addr, next, write, pages, &nr))
				goto slow;
		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			goto slow;
	} while (pgdp++, addr = next, addr != end);

	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	/* Slow path: fall back to get_user_pages() for whatever is left. */
	{
		int ret;

slow:
		local_irq_enable();
slow_irqon:
		pr_devel(" slow path ! nr = %d\n", nr);

		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, start,
			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
		up_read(&mm->mmap_sem);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}
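
/*
 * Usage sketch (hypothetical caller, not defined in this file): pin a
 * user buffer with get_user_pages_fast() and drop the references with
 * put_page() when done.  Error handling is deliberately minimal; the
 * function and buffer names are assumptions for illustration only.
 */
static inline int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
					  struct page **pages)
{
	int i, pinned;

	/* returns the number of pages pinned, which may be fewer than asked */
	pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... hand the pinned pages to hardware or access them here ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned;
}
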
#endif /* __HAVE_ARCH_PTE_SPECIAL */