2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* linux/mm/memory.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* demand-loading started 01.12.91 - seems it is high on the list of
|
|
|
|
* things wanted, and it should be easy to implement. - Linus
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, demand-loading was easy, shared pages a little bit trickier. Shared
|
|
|
|
* pages started 02.12.91, seems to work. - Linus.
|
|
|
|
*
|
|
|
|
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
|
|
|
|
* would have taken more than the 6M I have free, but it worked well as
|
|
|
|
* far as I could see.
|
|
|
|
*
|
|
|
|
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Real VM (paging to/from disk) started 18.12.91. Much more work and
|
|
|
|
* thought has to go into this. Oh, well..
|
|
|
|
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
|
|
|
|
* Found it. Everything seems to work now.
|
|
|
|
* 20.12.91 - Ok, making the swap-device changeable like the root.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 05.04.94 - Multi-page memory management added for v1.1.
|
|
|
|
* Idea by Alex Bligh (alex@cconcepts.co.uk)
|
|
|
|
*
|
|
|
|
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
|
|
|
|
* (Gerhard.Wichert@pdb.siemens.de)
|
|
|
|
*
|
|
|
|
* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel_stat.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/hugetlb.h>
|
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/rmap.h>
|
|
|
|
#include <linux/module.h>
|
2006-07-14 15:24:37 +08:00
|
|
|
#include <linux/delayacct.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/init.h>
|
2006-09-26 14:30:58 +08:00
|
|
|
#include <linux/writeback.h>
|
2008-02-07 16:13:53 +08:00
|
|
|
#include <linux/memcontrol.h>
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increased in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used at driver startup, a failure can be gracefully handled. Here is
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver starts up other allocations are required anyway and
-ENOMEM failure paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
#include <linux/mmu_notifier.h>
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() show current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
#include <linux/kallsyms.h>
|
|
|
|
#include <linux/swapops.h>
|
|
|
|
#include <linux/elf.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include <asm/tlb.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
|
2008-07-24 12:27:10 +08:00
|
|
|
#include "internal.h"
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
/*
 * The global max_mapnr and mem_map are only defined (and exported) here
 * when CONFIG_NEED_MULTIPLE_NODES is not set.
 * NOTE(review): presumably mem_map is the single flat array of struct page
 * and max_mapnr bounds its index range - confirm against the code that
 * initialises them; nothing in this section assigns either one.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
2005-04-17 06:20:36 +08:00
|
|
|
/* use the per-pgdat data instead for discontigmem - mbligh */
|
|
|
|
unsigned long max_mapnr;
|
|
|
|
struct page *mem_map;
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(max_mapnr);
|
|
|
|
EXPORT_SYMBOL(mem_map);
|
|
|
|
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
|
|
|
|
|
|
|
|
/*
 * Defined unconditionally and exported further down.
 * NOTE(review): presumably a count of physical pages - nothing in this
 * section assigns it, so confirm against the code that sets it.
 */
unsigned long num_physpages;
|
|
|
|
/*
|
|
|
|
* A number of key systems in x86 including ioremap() rely on the assumption
|
|
|
|
* that high_memory defines the upper bound on direct map memory, the end
|
|
|
|
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
|
|
|
|
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
|
|
|
|
* and ZONE_HIGHMEM.
|
|
|
|
*/
|
|
|
|
void * high_memory;
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(num_physpages);
|
|
|
|
EXPORT_SYMBOL(high_memory);
|
|
|
|
|
2008-02-07 05:39:44 +08:00
|
|
|
/*
|
|
|
|
* Randomize the address space (stacks, mmaps, brk, etc.).
|
|
|
|
*
|
|
|
|
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
|
|
|
|
* as ancient (libc5 based) binaries can segfault. )
|
|
|
|
*/
|
|
|
|
/*
 * Compile-time default: 1 when CONFIG_COMPAT_BRK is set, 2 otherwise.
 * The "norandmaps" setup handler in this file overrides it to 0.
 */
int randomize_va_space __read_mostly =
|
|
|
|
#ifdef CONFIG_COMPAT_BRK
|
|
|
|
/* COMPAT_BRK build: per the comment above, brk is excluded from randomization */
1;
|
|
|
|
#else
|
|
|
|
/* NOTE(review): presumably the full setting, with brk randomized as well - confirm */
2;
|
|
|
|
#endif /* CONFIG_COMPAT_BRK */
|
2006-02-17 06:41:58 +08:00
|
|
|
|
|
|
|
/*
 * disable_randmaps - handler registered below for the "norandmaps" setup
 * string.
 * @s: string passed in by the __setup machinery; not used here.
 *
 * Forces randomize_va_space to 0 and always returns 1.
 * NOTE(review): returning 1 presumably tells the __setup code that the
 * option was consumed - confirm against linux/init.h.
 */
static int __init disable_randmaps(char *s)
|
|
|
|
{
|
|
|
|
randomize_va_space = 0;
|
2006-03-31 18:30:33 +08:00
|
|
|
return 1;
|
2006-02-17 06:41:58 +08:00
|
|
|
}
|
|
|
|
/* Bind the "norandmaps" string to the handler above. */
__setup("norandmaps", disable_randmaps);
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* If a p?d_bad entry is found while walking page tables, report
|
|
|
|
* the error, before resetting entry to p?d_none. Usually (but
|
|
|
|
* very seldom) called out from the p?d_none_or_clear_bad macros.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * pgd_clear_bad - report a bad pgd entry, then reset it.
 * @pgd: pointer to the offending pgd entry.
 *
 * The entry's value is passed to pgd_ERROR() first, while it is still
 * intact, and only afterwards wiped with pgd_clear().
 */
void pgd_clear_bad(pgd_t *pgd)
|
|
|
|
{
|
|
|
|
/* report before clearing, so the bad value is still available */
pgd_ERROR(*pgd);
|
|
|
|
pgd_clear(pgd);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * pud_clear_bad - report a bad pud entry, then reset it.
 * @pud: pointer to the offending pud entry.
 *
 * The entry's value is passed to pud_ERROR() first, while it is still
 * intact, and only afterwards wiped with pud_clear().
 */
void pud_clear_bad(pud_t *pud)
|
|
|
|
{
|
|
|
|
/* report before clearing, so the bad value is still available */
pud_ERROR(*pud);
|
|
|
|
pud_clear(pud);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * pmd_clear_bad - report a bad pmd entry, then reset it.
 * @pmd: pointer to the offending pmd entry.
 *
 * The entry's value is passed to pmd_ERROR() first, while it is still
 * intact, and only afterwards wiped with pmd_clear().
 */
void pmd_clear_bad(pmd_t *pmd)
|
|
|
|
{
|
|
|
|
/* report before clearing, so the bad value is still available */
pmd_ERROR(*pmd);
|
|
|
|
pmd_clear(pmd);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: this doesn't free the actual pages themselves. That
|
|
|
|
* has been handled earlier when unmapping all the memory regions.
|
|
|
|
*/
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
/*
 * free_pte_range - detach the pte-level table referenced by a pmd entry and
 * hand it over for freeing.
 * @tlb: mmu_gather the table is passed to via pte_free_tlb(); the nr_ptes
 *       counter of tlb->mm is decremented.
 * @pmd: pmd entry whose table is released; the entry itself is cleared.
 *
 * The caller in this file only invokes this after pmd_none_or_clear_bad()
 * has returned false for the entry.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-02-08 20:22:04 +08:00
|
|
|
/* grab the table handle first: pmd_clear() below wipes the entry it is read from */
pgtable_t token = pmd_pgtable(*pmd);
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
pmd_clear(pmd);
|
2008-02-08 20:22:04 +08:00
|
|
|
/* pass the saved table to the gather; NOTE(review): the actual free is presumably deferred by the mmu_gather - confirm */
pte_free_tlb(tlb, token);
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
/* one fewer pte table accounted to this mm */
tlb->mm->nr_ptes--;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
unsigned long next;
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
unsigned long start;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
start = addr;
|
2005-04-17 06:20:36 +08:00
|
|
|
pmd = pmd_offset(pud, addr);
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
if (pmd_none_or_clear_bad(pmd))
|
|
|
|
continue;
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
free_pte_range(tlb, pmd);
|
2005-04-17 06:20:36 +08:00
|
|
|
} while (pmd++, addr = next, addr != end);
|
|
|
|
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
start &= PUD_MASK;
|
|
|
|
if (start < floor)
|
|
|
|
return;
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= PUD_MASK;
|
|
|
|
if (!ceiling)
|
|
|
|
return;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pmd = pmd_offset(pud, start);
|
|
|
|
pud_clear(pud);
|
|
|
|
pmd_free_tlb(tlb, pmd);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
/*
 * Walk the pud entries under @pgd that cover [addr, end), handing each
 * populated one to free_pmd_range().  Afterwards the pud table itself is
 * released, but only if the PGDIR-aligned region containing the range
 * does not reach below @floor or above @ceiling.
 *
 * A @ceiling of 0 stands for the top of the address space, which is why
 * the upper-bound comparison is done on "- 1" values.
 */
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	/* Bottom of the PGDIR-sized region this pud table serves. */
	unsigned long base = addr & PGDIR_MASK;
	unsigned long stop;
	pud_t *entry = pud_offset(pgd, addr);

	for (;;) {
		stop = pud_addr_end(addr, end);
		/* Empty (or bad, now cleared) entries have nothing below them. */
		if (!pud_none_or_clear_bad(entry))
			free_pmd_range(tlb, entry, addr, stop, floor, ceiling);
		entry++;
		addr = stop;
		if (addr == end)
			break;
	}

	/* The table extends below floor: leave it in place. */
	if (base < floor)
		return;

	if (ceiling != 0) {
		ceiling &= PGDIR_MASK;
		/* Rounding down must not turn a real ceiling into "top". */
		if (ceiling == 0)
			return;
	}

	/* The table extends above ceiling: leave it in place. */
	if (ceiling - 1 < end - 1)
		return;

	/* Look up the table before the pgd entry pointing at it is cleared. */
	entry = pud_offset(pgd, base);
	pgd_clear(pgd);
	pud_free_tlb(tlb, entry);
}
|
|
|
|
|
|
|
|
/*
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
* This function frees user-level page tables of a process.
|
|
|
|
*
|
2005-04-17 06:20:36 +08:00
|
|
|
* Must be called with pagetable lock held.
|
|
|
|
*/
|
2008-07-24 12:27:10 +08:00
|
|
|
void free_pgd_range(struct mmu_gather *tlb,
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
unsigned long next;
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
unsigned long start;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The next few lines have given us lots of grief...
|
|
|
|
*
|
|
|
|
* Why are we testing PMD* at this top level? Because often
|
|
|
|
* there will be no work to do at all, and we'd prefer not to
|
|
|
|
* go all the way down to the bottom just to discover that.
|
|
|
|
*
|
|
|
|
* Why all these "- 1"s? Because 0 represents both the bottom
|
|
|
|
* of the address space and the top of it (using -1 for the
|
|
|
|
* top wouldn't help much: the masks would do the wrong thing).
|
|
|
|
* The rule is that addr 0 and floor 0 refer to the bottom of
|
|
|
|
* the address space, but end 0 and ceiling 0 refer to the top
|
|
|
|
* Comparisons need to use "end - 1" and "ceiling - 1" (though
|
|
|
|
* that end 0 case should be mythical).
|
|
|
|
*
|
|
|
|
* Wherever addr is brought up or ceiling brought down, we must
|
|
|
|
* be careful to reject "the opposite 0" before it confuses the
|
|
|
|
* subsequent tests. But what about where end is brought down
|
|
|
|
* by PMD_SIZE below? no, end can't go down to 0 there.
|
|
|
|
*
|
|
|
|
* Whereas we round start (addr) and ceiling down, by different
|
|
|
|
* masks at different levels, in order to test whether a table
|
|
|
|
* now has no other vmas using it, so can be freed, we don't
|
|
|
|
* bother to round floor or end up - the tests don't need that.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
addr &= PMD_MASK;
|
|
|
|
if (addr < floor) {
|
|
|
|
addr += PMD_SIZE;
|
|
|
|
if (!addr)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= PMD_MASK;
|
|
|
|
if (!ceiling)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
end -= PMD_SIZE;
|
|
|
|
if (addr > end - 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
start = addr;
|
2008-07-24 12:27:10 +08:00
|
|
|
pgd = pgd_offset(tlb->mm, addr);
|
2005-04-17 06:20:36 +08:00
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
|
|
continue;
|
2008-07-24 12:27:10 +08:00
|
|
|
free_pud_range(tlb, pgd, addr, next, floor, ceiling);
|
2005-04-17 06:20:36 +08:00
|
|
|
} while (pgd++, addr = next, addr != end);
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
}
|
|
|
|
|
2008-07-24 12:27:10 +08:00
|
|
|
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
2005-04-20 04:29:16 +08:00
|
|
|
unsigned long floor, unsigned long ceiling)
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
{
|
|
|
|
while (vma) {
|
|
|
|
struct vm_area_struct *next = vma->vm_next;
|
|
|
|
unsigned long addr = vma->vm_start;
|
|
|
|
|
[PATCH] mm: unlink vma before pagetables
In most places the descent from pgd to pud to pmd to pte holds mmap_sem
(exclusively or not), which ensures that free_pgtables cannot be freeing page
tables from any level at the same time. But truncation and reverse mapping
descend without mmap_sem.
No problem: just make sure that a vma is unlinked from its prio_tree (or
nonlinear list) and from its anon_vma list, after zapping the vma, but before
freeing its page tables. Then neither vmtruncate nor rmap can reach that vma
whose page tables are now volatile (nor do they need to reach it, since all
its page entries have been zapped by this stage).
The i_mmap_lock and anon_vma->lock already serialize this correctly; but the
locking hierarchy is such that we cannot take them while holding
page_table_lock. Well, we're trying to push that down anyway. So in this
patch, move anon_vma_unlink and unlink_file_vma into free_pgtables, at the
same time as moving page_table_lock around calls to unmap_vmas.
tlb_gather_mmu and tlb_finish_mmu then fall outside the page_table_lock, but
we made them preempt_disable and preempt_enable earlier; and a long source
audit of all the architectures has shown no problem with removing
page_table_lock from them. free_pgtables doesn't need page_table_lock for
itself, nor for what it calls; tlb->mm->nr_ptes is usually protected by
page_table_lock, but partly by non-exclusive mmap_sem - here it's decremented
with exclusive mmap_sem, or mm_users 0. update_hiwater_rss and
vm_unacct_memory don't need page_table_lock either.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:29 +08:00
|
|
|
/*
|
|
|
|
* Hide vma from rmap and vmtruncate before freeing pgtables
|
|
|
|
*/
|
|
|
|
anon_vma_unlink(vma);
|
|
|
|
unlink_file_vma(vma);
|
|
|
|
|
[PATCH] hugepage: Fix hugepage logic in free_pgtables()
free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs. However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma. is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned. So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.
At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.
Nonetheless this should be fixed. We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().
This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64). We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64. Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range(). Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.
This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 16:08:57 +08:00
|
|
|
if (is_vm_hugetlb_page(vma)) {
|
2005-04-20 04:29:16 +08:00
|
|
|
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
floor, next? next->vm_start: ceiling);
|
2005-04-20 04:29:16 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Optimization: gather nearby vmas into one call down
|
|
|
|
*/
|
|
|
|
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
|
2006-03-22 16:08:58 +08:00
|
|
|
&& !is_vm_hugetlb_page(next)) {
|
2005-04-20 04:29:16 +08:00
|
|
|
vma = next;
|
|
|
|
next = vma->vm_next;
|
[PATCH] mm: unlink vma before pagetables
In most places the descent from pgd to pud to pmd to pte holds mmap_sem
(exclusively or not), which ensures that free_pgtables cannot be freeing page
tables from any level at the same time. But truncation and reverse mapping
descend without mmap_sem.
No problem: just make sure that a vma is unlinked from its prio_tree (or
nonlinear list) and from its anon_vma list, after zapping the vma, but before
freeing its page tables. Then neither vmtruncate nor rmap can reach that vma
whose page tables are now volatile (nor do they need to reach it, since all
its page entries have been zapped by this stage).
The i_mmap_lock and anon_vma->lock already serialize this correctly; but the
locking hierarchy is such that we cannot take them while holding
page_table_lock. Well, we're trying to push that down anyway. So in this
patch, move anon_vma_unlink and unlink_file_vma into free_pgtables, at the
same time as moving page_table_lock around calls to unmap_vmas.
tlb_gather_mmu and tlb_finish_mmu then fall outside the page_table_lock, but
we made them preempt_disable and preempt_enable earlier; and a long source
audit of all the architectures has shown no problem with removing
page_table_lock from them. free_pgtables doesn't need page_table_lock for
itself, nor for what it calls; tlb->mm->nr_ptes is usually protected by
page_table_lock, but partly by non-exclusive mmap_sem - here it's decremented
with exclusive mmap_sem, or mm_users 0. update_hiwater_rss and
vm_unacct_memory don't need page_table_lock either.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:29 +08:00
|
|
|
anon_vma_unlink(vma);
|
|
|
|
unlink_file_vma(vma);
|
2005-04-20 04:29:16 +08:00
|
|
|
}
|
|
|
|
free_pgd_range(tlb, addr, vma->vm_end,
|
|
|
|
floor, next? next->vm_start: ceiling);
|
|
|
|
}
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-20 04:29:15 +08:00
|
|
|
vma = next;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2005-10-30 09:16:22 +08:00
|
|
|
/*
 * __pte_alloc - allocate a pte page and hook it into *@pmd.
 *
 * The page is obtained from pte_alloc_one() before mm->page_table_lock is
 * taken.  If *@pmd turns out to be present once the lock is held, the
 * freshly allocated page is not installed and is released with pte_free().
 *
 * Returns -ENOMEM if the allocation fails, 0 otherwise.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	pgtable_t new;
	int installed = 0;

	new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Everything done to set up the pte page (eg. initialising its
	 * lock and clearing it) has to be visible before the page itself
	 * becomes visible to other CPUs through the page tables.
	 *
	 * The matching read side is the pointer chasing done by the page
	 * table walking code, which mostly runs without locking.  Those
	 * accesses form a chain of data-dependent loads, which most CPUs
	 * (alpha being the notable exception) already keep in order.  See
	 * the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		installed = 1;
	}
	spin_unlock(&mm->page_table_lock);

	/* The pmd was already present, so our page is surplus. */
	if (!installed)
		pte_free(mm, new);

	return 0;
}
|
|
|
|
|
2005-10-30 09:16:22 +08:00
|
|
|
/*
 * __pte_alloc_kernel - allocate a pte table for init_mm and hook it into
 * *@pmd.
 *
 * The table comes from pte_alloc_one_kernel() and is installed under
 * init_mm.page_table_lock.  If *@pmd is found present once the lock is
 * held, the new table is not installed and is released with
 * pte_free_kernel().
 *
 * Returns -ENOMEM if the allocation fails, 0 otherwise.
 */
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new;
	pte_t *unused;

	new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	/* Until it is installed, the new table counts as unused. */
	unused = new;

	spin_lock(&init_mm.page_table_lock);
	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		unused = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);

	if (unused)
		pte_free_kernel(&init_mm, unused);

	return 0;
}
|
|
|
|
|
2005-10-30 09:16:05 +08:00
|
|
|
/*
 * add_mm_rss - apply file_rss and anon_rss deltas to @mm.
 *
 * Each non-zero delta is passed to add_mm_counter(); a zero delta is
 * skipped.  The parameter names are also passed as the second argument
 * of add_mm_counter(), so they must not be renamed here.
 */
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss != 0) {
		add_mm_counter(mm, file_rss, file_rss);
	}

	if (anon_rss != 0) {
		add_mm_counter(mm, anon_rss, anon_rss);
	}
}
|
|
|
|
|
2005-10-30 09:16:12 +08:00
|
|
|
/*
|
2005-11-29 06:34:23 +08:00
|
|
|
* This function is called to print an error when a bad pte
|
|
|
|
* is found. For example, we might have a PFN-mapped pte in
|
|
|
|
* a region that doesn't allow it.
|
2005-10-30 09:16:12 +08:00
|
|
|
*
|
|
|
|
* The calling function must still handle the error.
|
|
|
|
*/
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() show current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
pte_t pte, struct page *page)
|
2005-10-30 09:16:12 +08:00
|
|
|
{
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() show current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
|
|
|
|
pud_t *pud = pud_offset(pgd, addr);
|
|
|
|
pmd_t *pmd = pmd_offset(pud, addr);
|
|
|
|
struct address_space *mapping;
|
|
|
|
pgoff_t index;
|
2009-01-07 06:40:12 +08:00
|
|
|
static unsigned long resume;
|
|
|
|
static unsigned long nr_shown;
|
|
|
|
static unsigned long nr_unshown;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allow a burst of 60 reports, then keep quiet for that minute;
|
|
|
|
* or allow a steady drip of one report per second.
|
|
|
|
*/
|
|
|
|
if (nr_shown == 60) {
|
|
|
|
if (time_before(jiffies, resume)) {
|
|
|
|
nr_unshown++;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (nr_unshown) {
|
2009-01-07 06:40:13 +08:00
|
|
|
printk(KERN_ALERT
|
|
|
|
"BUG: Bad page map: %lu messages suppressed\n",
|
2009-01-07 06:40:12 +08:00
|
|
|
nr_unshown);
|
|
|
|
nr_unshown = 0;
|
|
|
|
}
|
|
|
|
nr_shown = 0;
|
|
|
|
}
|
|
|
|
if (nr_shown++ == 0)
|
|
|
|
resume = jiffies + 60 * HZ;
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() show current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
|
|
|
|
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
|
|
|
|
index = linear_page_index(vma, addr);
|
|
|
|
|
2009-01-07 06:40:13 +08:00
|
|
|
printk(KERN_ALERT
|
|
|
|
"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() show current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
current->comm,
|
|
|
|
(long long)pte_val(pte), (long long)pmd_val(*pmd));
|
|
|
|
if (page) {
|
2009-01-07 06:40:13 +08:00
|
|
|
printk(KERN_ALERT
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() show current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
|
|
|
|
page, (void *)page->flags, page_count(page),
|
|
|
|
page_mapcount(page), page->mapping, page->index);
|
|
|
|
}
|
2009-01-07 06:40:13 +08:00
|
|
|
printk(KERN_ALERT
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() shows current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
|
|
|
|
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
|
|
|
|
/*
|
|
|
|
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
|
|
|
|
*/
|
|
|
|
if (vma->vm_ops)
|
2009-01-07 06:40:13 +08:00
|
|
|
print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() shows current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
(unsigned long)vma->vm_ops->fault);
|
|
|
|
if (vma->vm_file && vma->vm_file->f_op)
|
2009-01-07 06:40:13 +08:00
|
|
|
print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() shows current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
(unsigned long)vma->vm_file->f_op->mmap);
|
2005-10-30 09:16:12 +08:00
|
|
|
dump_stack();
|
badpage: replace page_remove_rmap Eeek and BUG
Now that bad pages are kept out of circulation, there is no need for the
infamous page_remove_rmap() BUG() - once that page is freed, its negative
mapcount will issue a "Bad page state" message and the page won't be
freed. Removing the BUG() allows more info, on subsequent pages, to be
gathered.
We do have more info about the page at this point than bad_page() can know
- notably, what the pmd is, which might pinpoint something like low 64kB
corruption - but page_remove_rmap() isn't given the address to find that.
In practice, there is only one call to page_remove_rmap() which has ever
reported anything, that from zap_pte_range() (usually on exit, sometimes
on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek"
message and leave it all to zap_pte_range().
mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the rmap
case: pte info, page info (when there is a page) and vma info to compare.
zap_pte_range() already knows the pmd, but print_bad_pte() is easier to
use if it works that out for itself.
Some of this info is also shown in bad_page()'s "Bad page state" message.
Keep them separate, but adjust them to match each other as far as
possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE
there too.
print_bad_pte() shows current->comm unconditionally (though it should get
repeated in the usually irrelevant stack trace): sorry, I misled Nick
Piggin to make it conditional on vm_mm == current->mm, but current->mm is
already NULL in the exit case. Usually current->comm is good, though
exceptionally it may not be that of the mm (when "swapoff" for example).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:08 +08:00
|
|
|
add_taint(TAINT_BAD_PAGE);
|
2005-10-30 09:16:12 +08:00
|
|
|
}
|
|
|
|
|
2005-12-12 12:38:17 +08:00
|
|
|
static inline int is_cow_mapping(unsigned int flags)
|
|
|
|
{
|
|
|
|
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
|
|
|
|
}
|
|
|
|
|
[PATCH] unpaged: anon in VM_UNPAGED
copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.
But recognizing them is a little subtle: because PageReserved is no longer a
condition for remap_pfn_range, we can now mmap all of /dev/mem (whether the
distro permits, and whether it's advisable on this or that architecture, is
another matter). So if we can see a PageAnon, it may not be ours to mess with
(or may be ours from elsewhere in the address space). I suspect there's an
entertaining insoluble self-referential problem here, but the page_is_anon
function does a good practical job, and MAP_PRIVATE PROT_WRITE VM_UNPAGED will
always be an odd choice.
In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 13:32:18 +08:00
|
|
|
/*
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* vm_normal_page -- This function gets the "struct page" associated with a pte.
|
2005-11-29 06:34:23 +08:00
|
|
|
*
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* "Special" mappings do not wish to be associated with a "struct page" (either
|
|
|
|
* it doesn't exist, or it exists but they don't want to touch it). In this
|
|
|
|
* case, NULL is returned here. "Normal" mappings do have a struct page.
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that has no
struct page allocated to them. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
*
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* There are 2 broad cases. Firstly, an architecture may define a pte_special()
|
|
|
|
* pte bit, in which case this function is trivial. Secondly, an architecture
|
|
|
|
* may not have a spare pte bit, which requires a more complicated scheme,
|
|
|
|
* described below.
|
|
|
|
*
|
|
|
|
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
|
|
|
|
* special mapping (even if there are underlying and valid "struct pages").
|
|
|
|
* COWed pages of a VM_PFNMAP are always normal.
|
2005-11-29 06:34:23 +08:00
|
|
|
*
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that has no
struct page allocated to them. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
* The way we recognize COWed pages within VM_PFNMAP mappings is through the
|
|
|
|
* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* set, and the vm_pgoff will point to the first PFN mapped: thus every special
|
|
|
|
* mapping will always honor the rule
|
2005-11-29 06:34:23 +08:00
|
|
|
*
|
|
|
|
* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
|
|
|
|
*
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* And for normal mappings this is false.
|
|
|
|
*
|
|
|
|
* This restricts such mappings to be a linear translation from virtual address
|
|
|
|
* to pfn. To get around this restriction, we allow arbitrary mappings so long
|
|
|
|
* as the vma is not a COW mapping; in that case, we know that all ptes are
|
|
|
|
* special (because none can have been COWed).
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that has no
struct page allocated to them. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
*
|
|
|
|
*
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that has no
struct page allocated to it. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
*
|
|
|
|
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
|
|
|
|
* page" backing, however the difference is that _all_ pages with a struct
|
|
|
|
* page (that is, those where pfn_valid is true) are refcounted and considered
|
|
|
|
* normal pages by the VM. The disadvantage is that pages are refcounted
|
|
|
|
* (which can be slower and simply not an option for some PFNMAP users). The
|
|
|
|
* advantage is that we don't have to follow the strict linearity rule of
|
|
|
|
* PFNMAP mappings in order to support COWable mappings.
|
|
|
|
*
|
[PATCH] unpaged: anon in VM_UNPAGED
copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.
But recognizing them is a little subtle: because PageReserved is no longer a
condition for remap_pfn_range, we can now mmap all of /dev/mem (whether the
distro permits, and whether it's advisable on this or that architecture, is
another matter). So if we can see a PageAnon, it may not be ours to mess with
(or may be ours from elsewhere in the address space). I suspect there's an
entertaining insoluble self-referential problem here, but the page_is_anon
function does a good practical job, and MAP_PRIVATE PROT_WRITE VM_UNPAGED will
always be an odd choice.
In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 13:32:18 +08:00
|
|
|
*/
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
/*
 * HAVE_PTE_SPECIAL is a 0/1 constant mirroring __HAVE_ARCH_PTE_SPECIAL, so
 * that vm_normal_page() can test it with an ordinary C "if" instead of
 * wrapping its pte_special() path in #ifdef.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
|
|
|
|
# define HAVE_PTE_SPECIAL 1
|
|
|
|
#else
|
|
|
|
# define HAVE_PTE_SPECIAL 0
|
|
|
|
#endif
|
|
|
|
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
pte_t pte)
|
[PATCH] unpaged: anon in VM_UNPAGED
copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.
But recognizing them is a little subtle: because PageReserved is no longer a
condition for remap_pfn_range, we can now mmap all of /dev/mem (whether the
distro permits, and whether it's advisable on this or that architecture, is
another matter). So if we can see a PageAnon, it may not be ours to mess with
(or may be ours from elsewhere in the address space). I suspect there's an
entertaining insoluble self-referential problem here, but the page_is_anon
function does a good practical job, and MAP_PRIVATE PROT_WRITE VM_UNPAGED will
always be an odd choice.
In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 13:32:18 +08:00
|
|
|
{
|
2009-01-07 06:40:09 +08:00
|
|
|
unsigned long pfn = pte_pfn(pte);
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
|
|
|
|
if (HAVE_PTE_SPECIAL) {
|
2009-01-07 06:40:09 +08:00
|
|
|
if (likely(!pte_special(pte)))
|
|
|
|
goto check_pfn;
|
|
|
|
if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
|
|
|
|
print_bad_pte(vma, addr, pte, NULL);
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* !HAVE_PTE_SPECIAL case follows: */
|
|
|
|
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that has no
struct page allocated to it. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
|
|
|
|
if (vma->vm_flags & VM_MIXEDMAP) {
|
|
|
|
if (!pfn_valid(pfn))
|
|
|
|
return NULL;
|
|
|
|
goto out;
|
|
|
|
} else {
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
unsigned long off;
|
|
|
|
off = (addr - vma->vm_start) >> PAGE_SHIFT;
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that has no
struct page allocated to it. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
if (pfn == vma->vm_pgoff + off)
|
|
|
|
return NULL;
|
|
|
|
if (!is_cow_mapping(vma->vm_flags))
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-11-29 06:34:23 +08:00
|
|
|
}
|
|
|
|
|
2009-01-07 06:40:09 +08:00
|
|
|
check_pfn:
|
|
|
|
if (unlikely(pfn > highest_memmap_pfn)) {
|
|
|
|
print_bad_pte(vma, addr, pte, NULL);
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-11-29 06:34:23 +08:00
|
|
|
|
|
|
|
/*
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, those are the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
* NOTE! We still have PageReserved() pages in the page tables.
|
|
|
|
* eg. VDSO mappings can cause them to exist.
|
2005-11-29 06:34:23 +08:00
|
|
|
*/
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that have no
struct page allocated to them. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
out:
|
2005-11-29 06:34:23 +08:00
|
|
|
return pfn_to_page(pfn);
|
[PATCH] unpaged: anon in VM_UNPAGED
copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.
But recognizing them is a little subtle: because PageReserved is no longer a
condition for remap_pfn_range, we can now mmap all of /dev/mem (whether the
distro permits, and whether it's advisable on this or that architecture, is
another matter). So if we can see a PageAnon, it may not be ours to mess with
(or may be ours from elsewhere in the address space). I suspect there's an
entertaining insoluble self-referential problem here, but the page_is_anon
function does a good practical job, and MAP_PRIVATE PROT_WRITE VM_UNPAGED will
always be an odd choice.
In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 13:32:18 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* copy one vm_area from one task to the other. Assumes the page tables
|
|
|
|
* already present in the new task to be cleared in the whole range
|
|
|
|
* covered by this vma.
|
|
|
|
*/
|
|
|
|
|
2005-10-30 09:16:13 +08:00
|
|
|
static inline void
|
2005-04-17 06:20:36 +08:00
|
|
|
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
2005-10-30 09:16:12 +08:00
|
|
|
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
|
2005-10-30 09:16:13 +08:00
|
|
|
unsigned long addr, int *rss)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-10-30 09:16:12 +08:00
|
|
|
unsigned long vm_flags = vma->vm_flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte_t pte = *src_pte;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
/* pte contains position in swap or file, so copy. */
|
|
|
|
if (unlikely(!pte_present(pte))) {
|
|
|
|
if (!pte_file(pte)) {
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to a page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:35 +08:00
|
|
|
swp_entry_t entry = pte_to_swp_entry(pte);
|
|
|
|
|
|
|
|
swap_duplicate(entry);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* make sure dst_mm is on swapoff's mmlist. */
|
|
|
|
if (unlikely(list_empty(&dst_mm->mmlist))) {
|
|
|
|
spin_lock(&mmlist_lock);
|
2005-10-30 09:16:41 +08:00
|
|
|
if (list_empty(&dst_mm->mmlist))
|
|
|
|
list_add(&dst_mm->mmlist,
|
|
|
|
&src_mm->mmlist);
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&mmlist_lock);
|
|
|
|
}
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to a page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:35 +08:00
|
|
|
if (is_write_migration_entry(entry) &&
|
|
|
|
is_cow_mapping(vm_flags)) {
|
|
|
|
/*
|
|
|
|
* COW mappings require pages in both parent
|
|
|
|
* and child to be set to read.
|
|
|
|
*/
|
|
|
|
make_migration_entry_read(&entry);
|
|
|
|
pte = swp_entry_to_pte(entry);
|
|
|
|
set_pte_at(src_mm, addr, src_pte, pte);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2005-10-30 09:16:05 +08:00
|
|
|
goto out_set_pte;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it's a COW mapping, write protect it both
|
|
|
|
* in the parent and the child
|
|
|
|
*/
|
2005-12-12 12:38:17 +08:00
|
|
|
if (is_cow_mapping(vm_flags)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
ptep_set_wrprotect(src_mm, addr, src_pte);
|
2006-10-01 14:29:30 +08:00
|
|
|
pte = pte_wrprotect(pte);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it's a shared mapping, mark it clean in
|
|
|
|
* the child
|
|
|
|
*/
|
|
|
|
if (vm_flags & VM_SHARED)
|
|
|
|
pte = pte_mkclean(pte);
|
|
|
|
pte = pte_mkold(pte);
|
2005-11-29 06:34:23 +08:00
|
|
|
|
|
|
|
page = vm_normal_page(vma, addr, pte);
|
|
|
|
if (page) {
|
|
|
|
get_page(page);
|
2007-05-17 13:11:21 +08:00
|
|
|
page_dup_rmap(page, vma, addr);
|
2005-11-29 06:34:23 +08:00
|
|
|
rss[!!PageAnon(page)]++;
|
|
|
|
}
|
2005-10-30 09:16:05 +08:00
|
|
|
|
|
|
|
out_set_pte:
|
|
|
|
set_pte_at(dst_mm, addr, dst_pte, pte);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
|
|
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
pte_t *src_pte, *dst_pte;
|
2005-10-30 09:16:23 +08:00
|
|
|
spinlock_t *src_ptl, *dst_ptl;
|
2005-10-30 09:15:53 +08:00
|
|
|
int progress = 0;
|
2005-10-30 09:16:13 +08:00
|
|
|
int rss[2];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
again:
|
2005-10-30 09:16:05 +08:00
|
|
|
rss[1] = rss[0] = 0;
|
2005-10-30 09:16:23 +08:00
|
|
|
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!dst_pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
src_pte = pte_offset_map_nested(src_pmd, addr);
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
src_ptl = pte_lockptr(src_mm, src_pmd);
|
2006-07-03 15:25:08 +08:00
|
|
|
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
|
2006-10-01 14:29:33 +08:00
|
|
|
arch_enter_lazy_mmu_mode();
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
/*
|
|
|
|
* We are holding two locks at this point - either of them
|
|
|
|
* could generate latencies in another task on another CPU.
|
|
|
|
*/
|
2005-10-30 09:15:53 +08:00
|
|
|
if (progress >= 32) {
|
|
|
|
progress = 0;
|
|
|
|
if (need_resched() ||
|
2008-01-30 20:31:20 +08:00
|
|
|
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
|
2005-10-30 09:15:53 +08:00
|
|
|
break;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (pte_none(*src_pte)) {
|
|
|
|
progress++;
|
|
|
|
continue;
|
|
|
|
}
|
2005-10-30 09:16:13 +08:00
|
|
|
copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
|
2005-04-17 06:20:36 +08:00
|
|
|
progress += 8;
|
|
|
|
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
|
|
|
|
|
2006-10-01 14:29:33 +08:00
|
|
|
arch_leave_lazy_mmu_mode();
|
2005-10-30 09:16:23 +08:00
|
|
|
spin_unlock(src_ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
pte_unmap_nested(src_pte - 1);
|
2005-10-30 09:16:05 +08:00
|
|
|
add_mm_rss(dst_mm, rss[0], rss[1]);
|
2005-10-30 09:16:23 +08:00
|
|
|
pte_unmap_unlock(dst_pte - 1, dst_ptl);
|
|
|
|
cond_resched();
|
2005-04-17 06:20:36 +08:00
|
|
|
if (addr != end)
|
|
|
|
goto again;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
|
|
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
pmd_t *src_pmd, *dst_pmd;
|
|
|
|
unsigned long next;
|
|
|
|
|
|
|
|
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
|
|
|
|
if (!dst_pmd)
|
|
|
|
return -ENOMEM;
|
|
|
|
src_pmd = pmd_offset(src_pud, addr);
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
if (pmd_none_or_clear_bad(src_pmd))
|
|
|
|
continue;
|
|
|
|
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
|
|
|
|
vma, addr, next))
|
|
|
|
return -ENOMEM;
|
|
|
|
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
|
|
pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
pud_t *src_pud, *dst_pud;
|
|
|
|
unsigned long next;
|
|
|
|
|
|
|
|
dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
|
|
|
|
if (!dst_pud)
|
|
|
|
return -ENOMEM;
|
|
|
|
src_pud = pud_offset(src_pgd, addr);
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
if (pud_none_or_clear_bad(src_pud))
|
|
|
|
continue;
|
|
|
|
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
|
|
|
|
vma, addr, next))
|
|
|
|
return -ENOMEM;
|
|
|
|
} while (dst_pud++, src_pud++, addr = next, addr != end);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
pgd_t *src_pgd, *dst_pgd;
|
|
|
|
unsigned long next;
|
|
|
|
unsigned long addr = vma->vm_start;
|
|
|
|
unsigned long end = vma->vm_end;
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or a
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increased in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used when a driver starts up, a failure can be gracefully handled. Here is
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver starts up other allocations are required anyway and
-ENOMEM failure paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
int ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-28 14:49:11 +08:00
|
|
|
/*
|
|
|
|
* Don't copy ptes where a page fault will fill them correctly.
|
|
|
|
* Fork becomes much lighter when there are big shared or private
|
|
|
|
* readonly mappings. The tradeoff is that copy_page_range is more
|
|
|
|
* efficient than faulting.
|
|
|
|
*/
|
2005-12-17 02:21:23 +08:00
|
|
|
if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
|
2005-08-28 14:49:11 +08:00
|
|
|
if (!vma->anon_vma)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (is_vm_hugetlb_page(vma))
|
|
|
|
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
|
|
|
|
|
2008-12-20 05:47:29 +08:00
|
|
|
if (unlikely(is_pfn_mapping(vma))) {
|
2008-12-19 03:41:29 +08:00
|
|
|
/*
|
|
|
|
* We do not free on error cases below as remove_vma
|
|
|
|
* gets called on error from higher level routine
|
|
|
|
*/
|
|
|
|
ret = track_pfn_vma_copy(vma);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or a
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increased in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used when a driver starts up, a failure can be gracefully handled. Here is
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver starts up other allocations are required anyway and
-ENOMEM failure paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
/*
|
|
|
|
* We need to invalidate the secondary MMU mappings only when
|
|
|
|
* there could be a permission downgrade on the ptes of the
|
|
|
|
* parent mm. And a permission downgrade will only happen if
|
|
|
|
* is_cow_mapping() returns true.
|
|
|
|
*/
|
|
|
|
if (is_cow_mapping(vma->vm_flags))
|
|
|
|
mmu_notifier_invalidate_range_start(src_mm, addr, end);
|
|
|
|
|
|
|
|
ret = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
dst_pgd = pgd_offset(dst_mm, addr);
|
|
|
|
src_pgd = pgd_offset(src_mm, addr);
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
if (pgd_none_or_clear_bad(src_pgd))
|
|
|
|
continue;
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or a
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increased in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used when a driver starts up, a failure can be gracefully handled. Here is
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver starts up other allocations are required anyway and
-ENOMEM failure paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
|
|
|
|
vma, addr, next))) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
break;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or a
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increased in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used when a driver starts up, a failure can be gracefully handled. Here is
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver starts up other allocations are required anyway and
-ENOMEM failure paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
|
|
|
|
if (is_cow_mapping(vma->vm_flags))
|
|
|
|
mmu_notifier_invalidate_range_end(src_mm,
|
|
|
|
vma->vm_start, end);
|
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:42 +08:00
|
|
|
/*
 * zap_pte_range - clear the ptes mapping [addr, end) beneath one pmd.
 *
 * The pte page is mapped and locked for the whole walk, and the walk runs
 * inside an arch lazy-MMU section.  Every pte visited is charged against
 * *zap_work: an empty slot costs 1, anything else costs PAGE_SIZE.  The
 * walk stops once addr reaches end or the budget is no longer positive.
 *
 * @details, when non-NULL, restricts which entries are zapped:
 *   - check_mapping: only pages whose ->mapping matches are removed;
 *   - nonlinear_vma: only pages with index inside
 *     [first_index, last_index] are removed.
 * Non-present entries (swap/file ptes) are left alone whenever @details
 * is set.
 *
 * Returns the address at which the walk stopped.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	spinlock_t *ptl;
	pte_t *pte;
	int anon_delta = 0;
	int file_delta = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t entry = *pte;
		struct page *page;

		/* Nothing mapped here: the cheapest case for the budget. */
		if (pte_none(entry)) {
			(*zap_work)--;
			continue;
		}

		(*zap_work) -= PAGE_SIZE;

		if (!pte_present(entry)) {
			/*
			 * A restricted zap keeps non-present entries: with
			 * details->check_mapping swap entries stay, with
			 * details->nonlinear_vma file entries stay.
			 */
			if (unlikely(details))
				continue;
			if (pte_file(entry)) {
				/* A file pte is only legal in a nonlinear vma. */
				if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
					print_bad_pte(vma, addr, entry, NULL);
			} else if (unlikely(!free_swap_and_cache(
						pte_to_swp_entry(entry)))) {
				print_bad_pte(vma, addr, entry, NULL);
			}
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, entry);
		if (unlikely(details) && page) {
			/*
			 * unmap_shared_mapping_pages() invalidates the cache
			 * without truncating, so pages belonging to another
			 * mapping (i.e. private copies) must survive.
			 */
			if (details->check_mapping &&
			    details->check_mapping != page->mapping)
				continue;
			/*
			 * For nonlinear invalidate/truncate each page's
			 * index has to be tested against the target window.
			 */
			if (details->nonlinear_vma &&
			    (page->index < details->first_index ||
			     page->index > details->last_index))
				continue;
		}

		entry = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
		tlb_remove_tlb_entry(tlb, pte, addr);
		if (unlikely(!page))
			continue;

		/*
		 * In a nonlinear vma, a page sitting away from its linear
		 * position leaves a file pte behind recording its offset.
		 */
		if (unlikely(details) && details->nonlinear_vma &&
		    linear_page_index(details->nonlinear_vma, addr) !=
							page->index)
			set_pte_at(mm, addr, pte, pgoff_to_pte(page->index));

		if (PageAnon(page)) {
			anon_delta--;
		} else {
			/* Carry pte dirty/young state over to the page. */
			if (pte_dirty(entry))
				set_page_dirty(page);
			if (pte_young(entry) &&
			    likely(!VM_SequentialReadHint(vma)))
				mark_page_accessed(page);
			file_delta--;
		}

		page_remove_rmap(page);
		if (unlikely(page_mapcount(page) < 0))
			print_bad_pte(vma, addr, entry, page);
		tlb_remove_page(tlb, page);
	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

	add_mm_rss(mm, file_delta, anon_delta);
	arch_leave_lazy_mmu_mode();
	/* pte was advanced past the last entry visited; step back to unmap. */
	pte_unmap_unlock(pte - 1, ptl);

	return addr;
}
|
|
|
|
|
2005-11-14 08:06:42 +08:00
|
|
|
/*
 * zap_pmd_range - walk the pmds under @pud that cover [addr, end).
 *
 * An empty (or bad, then cleared) pmd costs one unit of *zap_work; any
 * other pmd is handed to zap_pte_range(), which may stop early and so
 * determines where the walk resumes.  The walk ends when addr reaches
 * end or *zap_work is no longer positive.
 *
 * Returns the address at which the walk stopped.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	for (;;) {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			(*zap_work)--;
		else
			next = zap_pte_range(tlb, vma, pmd, addr, next,
						zap_work, details);

		pmd++;
		addr = next;
		if (addr == end || *zap_work <= 0)
			break;
	}

	return addr;
}
|
|
|
|
|
2005-11-14 08:06:42 +08:00
|
|
|
/*
 * zap_pud_range - walk the puds under @pgd that cover [addr, end).
 *
 * An empty (or bad, then cleared) pud costs one unit of *zap_work; any
 * other pud is handed to zap_pmd_range(), whose return value is where
 * the walk resumes.  The walk ends when addr reaches end or *zap_work
 * is no longer positive.
 *
 * Returns the address at which the walk stopped.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	for (;;) {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			(*zap_work)--;
		else
			next = zap_pmd_range(tlb, vma, pud, addr, next,
						zap_work, details);

		pud++;
		addr = next;
		if (addr == end || *zap_work <= 0)
			break;
	}

	return addr;
}
|
|
|
|
|
2005-11-14 08:06:42 +08:00
|
|
|
static unsigned long unmap_page_range(struct mmu_gather *tlb,
|
|
|
|
struct vm_area_struct *vma,
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long addr, unsigned long end,
|
2005-11-14 08:06:42 +08:00
|
|
|
long *zap_work, struct zap_details *details)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
unsigned long next;
|
|
|
|
|
|
|
|
if (details && !details->check_mapping && !details->nonlinear_vma)
|
|
|
|
details = NULL;
|
|
|
|
|
|
|
|
BUG_ON(addr >= end);
|
|
|
|
tlb_start_vma(tlb, vma);
|
|
|
|
pgd = pgd_offset(vma->vm_mm, addr);
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
2005-11-14 08:06:42 +08:00
|
|
|
if (pgd_none_or_clear_bad(pgd)) {
|
|
|
|
(*zap_work)--;
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
2005-11-14 08:06:42 +08:00
|
|
|
}
|
|
|
|
next = zap_pud_range(tlb, vma, pgd, addr, next,
|
|
|
|
zap_work, details);
|
|
|
|
} while (pgd++, addr = next, (addr != end && *zap_work > 0));
|
2005-04-17 06:20:36 +08:00
|
|
|
tlb_end_vma(tlb, vma);
|
2005-11-14 08:06:42 +08:00
|
|
|
|
|
|
|
return addr;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * ZAP_BLOCK_SIZE is the byte budget unmap_vmas() starts each zap batch
 * with (it seeds zap_work).  Preemptible kernels use a small batch;
 * otherwise a much larger one is used.
 */
#ifndef CONFIG_PREEMPT
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#else
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#endif
|
|
|
|
|
|
|
|
/**
|
|
|
|
* unmap_vmas - unmap a range of memory covered by a list of vma's
|
|
|
|
* @tlbp: address of the caller's struct mmu_gather
|
|
|
|
* @vma: the starting vma
|
|
|
|
* @start_addr: virtual address at which to start unmapping
|
|
|
|
* @end_addr: virtual address at which to end unmapping
|
|
|
|
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
|
|
|
|
* @details: details of nonlinear truncation or shared cache invalidation
|
|
|
|
*
|
2005-04-20 04:29:15 +08:00
|
|
|
* Returns the end address of the unmapping (restart addr if interrupted).
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-10-30 09:16:30 +08:00
|
|
|
* Unmap all pages in the vma list.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-10-30 09:16:30 +08:00
|
|
|
* We aim to not hold locks for too long (for scheduling latency reasons).
|
|
|
|
* So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
|
2005-04-17 06:20:36 +08:00
|
|
|
* return the ending mmu_gather to the caller.
|
|
|
|
*
|
|
|
|
* Only addresses between `start' and `end' will be unmapped.
|
|
|
|
*
|
|
|
|
* The VMA list must be sorted in ascending virtual address order.
|
|
|
|
*
|
|
|
|
* unmap_vmas() assumes that the caller will flush the whole unmapped address
|
|
|
|
* range after unmap_vmas() returns. So the only responsibility here is to
|
|
|
|
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
|
|
|
|
* drops the lock and schedules.
|
|
|
|
*/
|
2005-10-30 09:16:30 +08:00
|
|
|
unsigned long unmap_vmas(struct mmu_gather **tlbp,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct vm_area_struct *vma, unsigned long start_addr,
|
|
|
|
unsigned long end_addr, unsigned long *nr_accounted,
|
|
|
|
struct zap_details *details)
|
|
|
|
{
|
2005-11-14 08:06:42 +08:00
|
|
|
long zap_work = ZAP_BLOCK_SIZE;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
|
|
|
|
int tlb_start_valid = 0;
|
2005-04-20 04:29:15 +08:00
|
|
|
unsigned long start = start_addr;
|
2005-04-17 06:20:36 +08:00
|
|
|
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
|
2005-10-30 09:16:02 +08:00
|
|
|
int fullmm = (*tlbp)->fullmm;
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
|
2005-04-17 06:20:36 +08:00
|
|
|
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
|
|
|
|
unsigned long end;
|
|
|
|
|
|
|
|
start = max(vma->vm_start, start_addr);
|
|
|
|
if (start >= vma->vm_end)
|
|
|
|
continue;
|
|
|
|
end = min(vma->vm_end, end_addr);
|
|
|
|
if (end <= vma->vm_start)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (vma->vm_flags & VM_ACCOUNT)
|
|
|
|
*nr_accounted += (end - start) >> PAGE_SHIFT;
|
|
|
|
|
2008-12-20 05:47:29 +08:00
|
|
|
if (unlikely(is_pfn_mapping(vma)))
|
2008-12-19 03:41:29 +08:00
|
|
|
untrack_pfn_vma(vma, 0, 0);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
while (start != end) {
|
|
|
|
if (!tlb_start_valid) {
|
|
|
|
tlb_start = start;
|
|
|
|
tlb_start_valid = 1;
|
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:42 +08:00
|
|
|
if (unlikely(is_vm_hugetlb_page(vma))) {
|
2008-07-24 12:27:43 +08:00
|
|
|
/*
|
|
|
|
* It is undesirable to test vma->vm_file as it
|
|
|
|
* should be non-null for valid hugetlb area.
|
|
|
|
* However, vm_file will be NULL in the error
|
|
|
|
* cleanup path of do_mmap_pgoff. When
|
|
|
|
* hugetlbfs ->mmap method fails,
|
|
|
|
* do_mmap_pgoff() nullifies vma->vm_file
|
|
|
|
* before calling this function to clean up.
|
|
|
|
* Since no pte has actually been setup, it is
|
|
|
|
* safe to do nothing in this case.
|
|
|
|
*/
|
|
|
|
if (vma->vm_file) {
|
|
|
|
unmap_hugepage_range(vma, start, end, NULL);
|
|
|
|
zap_work -= (end - start) /
|
2008-07-24 12:27:41 +08:00
|
|
|
pages_per_huge_page(hstate_vma(vma));
|
2008-07-24 12:27:43 +08:00
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:42 +08:00
|
|
|
start = end;
|
|
|
|
} else
|
|
|
|
start = unmap_page_range(*tlbp, vma,
|
|
|
|
start, end, &zap_work, details);
|
|
|
|
|
|
|
|
if (zap_work > 0) {
|
|
|
|
BUG_ON(start != end);
|
|
|
|
break;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
tlb_finish_mmu(*tlbp, tlb_start, start);
|
|
|
|
|
|
|
|
if (need_resched() ||
|
2008-01-30 20:31:20 +08:00
|
|
|
(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (i_mmap_lock) {
|
2005-10-30 09:16:30 +08:00
|
|
|
*tlbp = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
2005-10-30 09:16:30 +08:00
|
|
|
*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
|
2005-04-17 06:20:36 +08:00
|
|
|
tlb_start_valid = 0;
|
2005-11-14 08:06:42 +08:00
|
|
|
zap_work = ZAP_BLOCK_SIZE;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
|
2005-04-20 04:29:15 +08:00
|
|
|
return start; /* which is now the end (or restart) address */
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* zap_page_range - remove user pages in a given range
|
|
|
|
* @vma: vm_area_struct holding the applicable pages
|
|
|
|
* @address: starting address of pages to zap
|
|
|
|
* @size: number of bytes to zap
|
|
|
|
* @details: details of nonlinear truncation or shared cache invalidation
|
|
|
|
*/
|
2005-04-20 04:29:15 +08:00
|
|
|
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long size, struct zap_details *details)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
|
|
struct mmu_gather *tlb;
|
|
|
|
unsigned long end = address + size;
|
|
|
|
unsigned long nr_accounted = 0;
|
|
|
|
|
|
|
|
lru_add_drain();
|
|
|
|
tlb = tlb_gather_mmu(mm, 0);
|
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:18 +08:00
|
|
|
update_hiwater_rss(mm);
|
2005-10-30 09:16:30 +08:00
|
|
|
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
|
|
|
|
if (tlb)
|
|
|
|
tlb_finish_mmu(tlb, address, end);
|
2005-04-20 04:29:15 +08:00
|
|
|
return end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-07-30 13:33:53 +08:00
|
|
|
/**
|
|
|
|
* zap_vma_ptes - remove ptes mapping the vma
|
|
|
|
* @vma: vm_area_struct holding ptes to be zapped
|
|
|
|
* @address: starting address of pages to zap
|
|
|
|
* @size: number of bytes to zap
|
|
|
|
*
|
|
|
|
* This function only unmaps ptes assigned to VM_PFNMAP vmas.
|
|
|
|
*
|
|
|
|
* The entire address range must be fully contained within the vma.
|
|
|
|
*
|
|
|
|
* Returns 0 if successful.
|
|
|
|
*/
|
|
|
|
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
|
|
|
|
unsigned long size)
|
|
|
|
{
|
|
|
|
if (address < vma->vm_start || address + size > vma->vm_end ||
|
|
|
|
!(vma->vm_flags & VM_PFNMAP))
|
|
|
|
return -1;
|
|
|
|
zap_page_range(vma, address, size, NULL);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(zap_vma_ptes);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Do a quick page-table lookup for a single page.
|
|
|
|
*/
|
2005-11-29 06:34:23 +08:00
|
|
|
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
2005-10-30 09:16:33 +08:00
|
|
|
unsigned int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *ptep, pte;
|
2005-10-30 09:16:33 +08:00
|
|
|
spinlock_t *ptl;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct page *page;
|
2005-11-29 06:34:23 +08:00
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-30 09:16:33 +08:00
|
|
|
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
|
|
|
|
if (!IS_ERR(page)) {
|
|
|
|
BUG_ON(flags & FOLL_GET);
|
|
|
|
goto out;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-30 09:16:33 +08:00
|
|
|
page = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
pgd = pgd_offset(mm, address);
|
|
|
|
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
|
2005-10-30 09:16:33 +08:00
|
|
|
goto no_page_table;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
pud = pud_offset(pgd, address);
|
2008-07-24 12:27:50 +08:00
|
|
|
if (pud_none(*pud))
|
2005-10-30 09:16:33 +08:00
|
|
|
goto no_page_table;
|
2008-07-24 12:27:50 +08:00
|
|
|
if (pud_huge(*pud)) {
|
|
|
|
BUG_ON(flags & FOLL_GET);
|
|
|
|
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (unlikely(pud_bad(*pud)))
|
|
|
|
goto no_page_table;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
pmd = pmd_offset(pud, address);
|
2008-05-07 03:49:23 +08:00
|
|
|
if (pmd_none(*pmd))
|
2005-10-30 09:16:33 +08:00
|
|
|
goto no_page_table;
|
|
|
|
if (pmd_huge(*pmd)) {
|
|
|
|
BUG_ON(flags & FOLL_GET);
|
|
|
|
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
2005-10-30 09:16:33 +08:00
|
|
|
}
|
2008-05-07 03:49:23 +08:00
|
|
|
if (unlikely(pmd_bad(*pmd)))
|
|
|
|
goto no_page_table;
|
|
|
|
|
2005-10-30 09:16:33 +08:00
|
|
|
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
pte = *ptep;
|
2005-10-30 09:16:33 +08:00
|
|
|
if (!pte_present(pte))
|
2008-06-21 02:18:25 +08:00
|
|
|
goto no_page;
|
2005-10-30 09:16:33 +08:00
|
|
|
if ((flags & FOLL_WRITE) && !pte_write(pte))
|
|
|
|
goto unlock;
|
2005-11-29 06:34:23 +08:00
|
|
|
page = vm_normal_page(vma, address, pte);
|
|
|
|
if (unlikely(!page))
|
2008-06-21 02:18:25 +08:00
|
|
|
goto bad_page;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-30 09:16:33 +08:00
|
|
|
if (flags & FOLL_GET)
|
|
|
|
get_page(page);
|
|
|
|
if (flags & FOLL_TOUCH) {
|
|
|
|
if ((flags & FOLL_WRITE) &&
|
|
|
|
!pte_dirty(pte) && !PageDirty(page))
|
|
|
|
set_page_dirty(page);
|
2009-04-01 06:19:37 +08:00
|
|
|
/*
|
|
|
|
* pte_mkyoung() would be more correct here, but atomic care
|
|
|
|
* is needed to avoid losing the dirty bit: it is easier to use
|
|
|
|
* mark_page_accessed().
|
|
|
|
*/
|
2005-10-30 09:16:33 +08:00
|
|
|
mark_page_accessed(page);
|
|
|
|
}
|
|
|
|
unlock:
|
|
|
|
pte_unmap_unlock(ptep, ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
2005-10-30 09:16:33 +08:00
|
|
|
return page;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-06-21 02:18:25 +08:00
|
|
|
bad_page:
|
|
|
|
pte_unmap_unlock(ptep, ptl);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
|
|
|
no_page:
|
|
|
|
pte_unmap_unlock(ptep, ptl);
|
|
|
|
if (!pte_none(pte))
|
|
|
|
return page;
|
|
|
|
/* Fall through to ZERO_PAGE handling */
|
2005-10-30 09:16:33 +08:00
|
|
|
no_page_table:
|
|
|
|
/*
|
|
|
|
* When core dumping an enormous anonymous area that nobody
|
|
|
|
* has touched so far, we don't want to allocate page tables.
|
|
|
|
*/
|
|
|
|
if (flags & FOLL_ANON) {
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- mesuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
page = ZERO_PAGE(0);
|
2005-10-30 09:16:33 +08:00
|
|
|
if (flags & FOLL_GET)
|
|
|
|
get_page(page);
|
|
|
|
BUG_ON(flags & FOLL_WRITE);
|
|
|
|
}
|
|
|
|
return page;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-06-24 02:21:37 +08:00
|
|
|
/* Can we do the FOLL_ANON optimization? */
|
|
|
|
static inline int use_zero_page(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We don't want to optimize FOLL_ANON for make_pages_present()
|
|
|
|
* when it tries to page in a VM_LOCKED region. As to VM_SHARED,
|
|
|
|
* we want to get the page from the page tables to make sure
|
|
|
|
* that we serialize and update with any other user of that
|
|
|
|
* mapping.
|
|
|
|
*/
|
|
|
|
if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
|
|
|
|
return 0;
|
|
|
|
/*
|
2008-07-24 12:27:05 +08:00
|
|
|
* And if we have a fault routine, it's not an anonymous region.
|
2008-06-24 02:21:37 +08:00
|
|
|
*/
|
2008-07-24 12:27:05 +08:00
|
|
|
return !vma->vm_ops || !vma->vm_ops->fault;
|
2008-06-24 02:21:37 +08:00
|
|
|
}
|
|
|
|
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism lets pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages; therefore
PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
|
|
|
|
|
|
|
|
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
|
|
|
unsigned long start, int len, int flags,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct page **pages, struct vm_area_struct **vmas)
|
|
|
|
{
|
|
|
|
int i;
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
unsigned int vm_flags = 0;
|
|
|
|
int write = !!(flags & GUP_FLAGS_WRITE);
|
|
|
|
int force = !!(flags & GUP_FLAGS_FORCE);
|
|
|
|
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
|
mm: make get_user_pages() interruptible
The initial implementation of checking TIF_MEMDIE covers the cases of OOM
killing. If the process has been OOM killed, the TIF_MEMDIE is set and it
return immediately. This patch includes:
1. add the case that the SIGKILL is sent by user processes. The
process can try to get_user_pages() unlimited memory even if a user
process has sent a SIGKILL to it(maybe a monitor find the process
exceed its memory limit and try to kill it). In the old
implementation, the SIGKILL won't be handled until the get_user_pages()
returns.
2. change the return value to be ERESTARTSYS. It makes no sense to
return ENOMEM if the get_user_pages returned by getting a SIGKILL
signal. Considering the general convention for a system call
interrupted by a signal is ERESTARTNOSYS, so the current return value
is consistant to that.
Lee:
An unfortunate side effect of "make-get_user_pages-interruptible" is that
it prevents a SIGKILL'd task from munlock-ing pages that it had mlocked,
resulting in freeing of mlocked pages. Freeing of mlocked pages, in
itself, is not so bad. We just count them now--altho' I had hoped to
remove this stat and add PG_MLOCKED to the free pages flags check.
However, consider pages in shared libraries mapped by more than one task
that a task mlocked--e.g., via mlockall(). If the task that mlocked the
pages exits via SIGKILL, these pages would be left mlocked and
unevictable.
Proposed fix:
Add another GUP flag to ignore sigkill when calling get_user_pages from
munlock()--similar to Kosaki Motohiro's 'IGNORE_VMA_PERMISSIONS flag for
the same purpose. We are not actually allocating memory in this case,
which "make-get_user_pages-interruptible" intends to avoid. We're just
munlocking pages that are already resident and mapped, and we're reusing
get_user_pages() to access those pages.
?? Maybe we should combine 'IGNORE_VMA_PERMISSIONS and '_IGNORE_SIGKILL
into a single flag: GUP_FLAGS_MUNLOCK ???
[Lee.Schermerhorn@hp.com: ignore sigkill in get_user_pages during munlock]
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:18 +08:00
|
|
|
int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
Be more robust about bad arguments in get_user_pages()
So I spent a while pounding my head against my monitor trying to figure
out the vmsplice() vulnerability - how could a failure to check for
*read* access turn into a root exploit? It turns out that it's a buffer
overflow problem which is made easy by the way get_user_pages() is
coded.
In particular, "len" is a signed int, and it is only checked at the
*end* of a do {} while() loop. So, if it is passed in as zero, the loop
will execute once and decrement len to -1. At that point, the loop will
proceed until the next invalid address is found; in the process, it will
likely overflow the pages array passed in to get_user_pages().
I think that, if get_user_pages() has been asked to grab zero pages,
that's what it should do. Thus this patch; it is, among other things,
enough to block the (already fixed) root exploit and any others which
might be lurking in similar code. I also think that the number of pages
should be unsigned, but changing the prototype of this function probably
requires some more careful review.
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-12 07:17:33 +08:00
|
|
|
if (len <= 0)
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Require read or write permissions.
|
|
|
|
* If 'force' is set, we only require the "MAY" flags.
|
|
|
|
*/
|
2005-10-30 09:16:33 +08:00
|
|
|
vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
|
|
|
|
vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
|
2005-04-17 06:20:36 +08:00
|
|
|
i = 0;
|
|
|
|
|
|
|
|
do {
|
2005-10-30 09:16:33 +08:00
|
|
|
struct vm_area_struct *vma;
|
|
|
|
unsigned int foll_flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
vma = find_extend_vma(mm, start);
|
|
|
|
if (!vma && in_gate_area(tsk, start)) {
|
|
|
|
unsigned long pg = start & PAGE_MASK;
|
|
|
|
struct vm_area_struct *gate_vma = get_gate_vma(tsk);
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
|
|
|
|
/* user gate pages are read-only */
|
|
|
|
if (!ignore && write)
|
2005-04-17 06:20:36 +08:00
|
|
|
return i ? : -EFAULT;
|
|
|
|
if (pg > TASK_SIZE)
|
|
|
|
pgd = pgd_offset_k(pg);
|
|
|
|
else
|
|
|
|
pgd = pgd_offset_gate(mm, pg);
|
|
|
|
BUG_ON(pgd_none(*pgd));
|
|
|
|
pud = pud_offset(pgd, pg);
|
|
|
|
BUG_ON(pud_none(*pud));
|
|
|
|
pmd = pmd_offset(pud, pg);
|
2005-08-02 12:11:42 +08:00
|
|
|
if (pmd_none(*pmd))
|
|
|
|
return i ? : -EFAULT;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte = pte_offset_map(pmd, pg);
|
2005-08-02 12:11:42 +08:00
|
|
|
if (pte_none(*pte)) {
|
|
|
|
pte_unmap(pte);
|
|
|
|
return i ? : -EFAULT;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (pages) {
|
2005-11-29 15:43:17 +08:00
|
|
|
struct page *page = vm_normal_page(gate_vma, start, *pte);
|
2005-11-29 06:34:23 +08:00
|
|
|
pages[i] = page;
|
|
|
|
if (page)
|
|
|
|
get_page(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
pte_unmap(pte);
|
|
|
|
if (vmas)
|
|
|
|
vmas[i] = gate_vma;
|
|
|
|
i++;
|
|
|
|
start += PAGE_SIZE;
|
|
|
|
len--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
if (!vma ||
|
|
|
|
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
|
|
|
|
(!ignore && !(vm_flags & vma->vm_flags)))
|
2005-04-17 06:20:36 +08:00
|
|
|
return i ? : -EFAULT;
|
|
|
|
|
|
|
|
if (is_vm_hugetlb_page(vma)) {
|
|
|
|
i = follow_hugetlb_page(mm, vma, pages, vmas,
|
2007-11-15 08:59:33 +08:00
|
|
|
&start, &len, i, write);
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
2005-10-30 09:16:33 +08:00
|
|
|
|
|
|
|
foll_flags = FOLL_TOUCH;
|
|
|
|
if (pages)
|
|
|
|
foll_flags |= FOLL_GET;
|
2008-06-24 02:21:37 +08:00
|
|
|
if (!write && use_zero_page(vma))
|
2005-10-30 09:16:33 +08:00
|
|
|
foll_flags |= FOLL_ANON;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
do {
|
2005-06-22 08:15:10 +08:00
|
|
|
struct page *page;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-16 14:38:16 +08:00
|
|
|
/*
|
mm: make get_user_pages() interruptible
The initial implementation of checking TIF_MEMDIE covers the cases of OOM
killing. If the process has been OOM killed, the TIF_MEMDIE is set and it
return immediately. This patch includes:
1. add the case that the SIGKILL is sent by user processes. The
process can try to get_user_pages() unlimited memory even if a user
process has sent a SIGKILL to it(maybe a monitor find the process
exceed its memory limit and try to kill it). In the old
implementation, the SIGKILL won't be handled until the get_user_pages()
returns.
2. change the return value to be ERESTARTSYS. It makes no sense to
return ENOMEM if the get_user_pages returned by getting a SIGKILL
signal. Considering the general convention for a system call
interrupted by a signal is ERESTARTNOSYS, so the current return value
is consistant to that.
Lee:
An unfortunate side effect of "make-get_user_pages-interruptible" is that
it prevents a SIGKILL'd task from munlock-ing pages that it had mlocked,
resulting in freeing of mlocked pages. Freeing of mlocked pages, in
itself, is not so bad. We just count them now--altho' I had hoped to
remove this stat and add PG_MLOCKED to the free pages flags check.
However, consider pages in shared libraries mapped by more than one task
that a task mlocked--e.g., via mlockall(). If the task that mlocked the
pages exits via SIGKILL, these pages would be left mlocked and
unevictable.
Proposed fix:
Add another GUP flag to ignore sigkill when calling get_user_pages from
munlock()--similar to Kosaki Motohiro's 'IGNORE_VMA_PERMISSIONS flag for
the same purpose. We are not actually allocating memory in this case,
which "make-get_user_pages-interruptible" intends to avoid. We're just
munlocking pages that are already resident and mapped, and we're reusing
get_user_pages() to access those pages.
?? Maybe we should combine 'IGNORE_VMA_PERMISSIONS and '_IGNORE_SIGKILL
into a single flag: GUP_FLAGS_MUNLOCK ???
[Lee.Schermerhorn@hp.com: ignore sigkill in get_user_pages during munlock]
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:18 +08:00
|
|
|
* If we have a pending SIGKILL, don't keep faulting
|
|
|
|
* pages and potentially allocating memory, unless
|
|
|
|
* current is handling munlock--e.g., on exit. In
|
|
|
|
* that case, we are not allocating memory. Rather,
|
|
|
|
* we're only unlocking already resident/mapped pages.
|
2007-07-16 14:38:16 +08:00
|
|
|
*/
|
mm: make get_user_pages() interruptible
The initial implementation of checking TIF_MEMDIE covers the cases of OOM
killing. If the process has been OOM killed, the TIF_MEMDIE is set and it
return immediately. This patch includes:
1. add the case that the SIGKILL is sent by user processes. The
process can try to get_user_pages() unlimited memory even if a user
process has sent a SIGKILL to it(maybe a monitor find the process
exceed its memory limit and try to kill it). In the old
implementation, the SIGKILL won't be handled until the get_user_pages()
returns.
2. change the return value to be ERESTARTSYS. It makes no sense to
return ENOMEM if the get_user_pages returned by getting a SIGKILL
signal. Considering the general convention for a system call
interrupted by a signal is ERESTARTNOSYS, so the current return value
is consistant to that.
Lee:
An unfortunate side effect of "make-get_user_pages-interruptible" is that
it prevents a SIGKILL'd task from munlock-ing pages that it had mlocked,
resulting in freeing of mlocked pages. Freeing of mlocked pages, in
itself, is not so bad. We just count them now--altho' I had hoped to
remove this stat and add PG_MLOCKED to the free pages flags check.
However, consider pages in shared libraries mapped by more than one task
that a task mlocked--e.g., via mlockall(). If the task that mlocked the
pages exits via SIGKILL, these pages would be left mlocked and
unevictable.
Proposed fix:
Add another GUP flag to ignore sigkill when calling get_user_pages from
munlock()--similar to Kosaki Motohiro's 'IGNORE_VMA_PERMISSIONS flag for
the same purpose. We are not actually allocating memory in this case,
which "make-get_user_pages-interruptible" intends to avoid. We're just
munlocking pages that are already resident and mapped, and we're reusing
get_user_pages() to access those pages.
?? Maybe we should combine 'IGNORE_VMA_PERMISSIONS and '_IGNORE_SIGKILL
into a single flag: GUP_FLAGS_MUNLOCK ???
[Lee.Schermerhorn@hp.com: ignore sigkill in get_user_pages during munlock]
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:40:18 +08:00
|
|
|
if (unlikely(!ignore_sigkill &&
|
|
|
|
fatal_signal_pending(current)))
|
|
|
|
return i ? i : -ERESTARTSYS;
|
2007-07-16 14:38:16 +08:00
|
|
|
|
2005-10-30 09:16:33 +08:00
|
|
|
if (write)
|
|
|
|
foll_flags |= FOLL_WRITE;
|
2005-08-04 01:07:09 +08:00
|
|
|
|
2005-10-30 09:16:33 +08:00
|
|
|
cond_resched();
|
2005-11-29 06:34:23 +08:00
|
|
|
while (!(page = follow_page(vma, start, foll_flags))) {
|
2005-10-30 09:16:33 +08:00
|
|
|
int ret;
|
2009-04-11 00:01:23 +08:00
|
|
|
|
2009-06-23 20:52:49 +08:00
|
|
|
ret = handle_mm_fault(mm, vma, start,
|
|
|
|
(foll_flags & FOLL_WRITE) ?
|
|
|
|
FAULT_FLAG_WRITE : 0);
|
|
|
|
|
2007-07-19 16:47:05 +08:00
|
|
|
if (ret & VM_FAULT_ERROR) {
|
|
|
|
if (ret & VM_FAULT_OOM)
|
|
|
|
return i ? i : -ENOMEM;
|
|
|
|
else if (ret & VM_FAULT_SIGBUS)
|
|
|
|
return i ? i : -EFAULT;
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
if (ret & VM_FAULT_MAJOR)
|
|
|
|
tsk->maj_flt++;
|
|
|
|
else
|
|
|
|
tsk->min_flt++;
|
|
|
|
|
2005-08-04 01:07:09 +08:00
|
|
|
/*
|
2007-07-19 16:47:05 +08:00
|
|
|
* The VM_FAULT_WRITE bit tells us that
|
|
|
|
* do_wp_page has broken COW when necessary,
|
|
|
|
* even if maybe_mkwrite decided not to set
|
|
|
|
* pte_write. We can thus safely do subsequent
|
2009-01-07 06:39:32 +08:00
|
|
|
* page lookups as if they were reads. But only
|
|
|
|
* do so when looping for pte_write is futile:
|
|
|
|
* in some cases userspace may also be wanting
|
|
|
|
* to write to the gotten user page, which a
|
|
|
|
* read fault here might prevent (a readonly
|
|
|
|
* page might get reCOWed by userspace write).
|
2005-08-04 01:07:09 +08:00
|
|
|
*/
|
2009-01-07 06:39:32 +08:00
|
|
|
if ((ret & VM_FAULT_WRITE) &&
|
|
|
|
!(vma->vm_flags & VM_WRITE))
|
2005-10-30 09:16:33 +08:00
|
|
|
foll_flags &= ~FOLL_WRITE;
|
2007-07-19 16:47:05 +08:00
|
|
|
|
[PATCH] page fault retry with NOPAGE_REFAULT
Add a way for a no_page() handler to request a retry of the faulting
instruction. It goes back to userland on page faults and just tries again
in get_user_pages(). I added a cond_resched() in the loop in that later
case.
The problem I have with signal and spufs is an actual bug affecting apps and I
don't see other ways of fixing it.
In addition, we are having issues with infiniband and 64k pages (related to
the way the hypervisor deals with some HV cards) that will require us to muck
around with the MMU from within the IB driver's no_page() (it's a pSeries
specific driver) and return to the caller the same way using NOPAGE_REFAULT.
And to add to this, the graphics folks have been following a new approach of
memory management that involves transparently swapping objects between video
ram and main meory. To do that, they need installing PTEs from a no_page()
handler as well and that also requires returning with NOPAGE_REFAULT.
(For the later, they are currently using io_remap_pfn_range to install one PTE
from no_page() which is a bit racy, we need to add a check for the PTE having
already been installed afer taking the lock, but that's ok, they are only at
the proof-of-concept stage. I'll send a patch adding a "clean" function to do
that, we can use that from spufs too and get rid of the sparsemem hacks we do
to create struct page for SPEs. Basically, that provides a generic solution
for being able to have no_page() map hardware devices, which is something that
I think sound driver folks have been asking for some time too).
All of these things depend on having the NOPAGE_REFAULT exit path from
no_page() handlers.
Signed-off-by: Benjamin Herrenchmidt <benh@kernel.crashing.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-06 15:43:53 +08:00
|
|
|
cond_resched();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-06-21 02:18:25 +08:00
|
|
|
if (IS_ERR(page))
|
|
|
|
return i ? i : PTR_ERR(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (pages) {
|
2005-06-22 08:15:10 +08:00
|
|
|
pages[i] = page;
|
2006-03-26 17:36:57 +08:00
|
|
|
|
2006-12-31 06:24:19 +08:00
|
|
|
flush_anon_page(vma, page, start);
|
2005-06-22 08:15:10 +08:00
|
|
|
flush_dcache_page(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
if (vmas)
|
|
|
|
vmas[i] = vma;
|
|
|
|
i++;
|
|
|
|
start += PAGE_SIZE;
|
|
|
|
len--;
|
2005-06-22 08:15:10 +08:00
|
|
|
} while (len && start < vma->vm_end);
|
|
|
|
} while (len);
|
2005-04-17 06:20:36 +08:00
|
|
|
return i;
|
|
|
|
}
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
|
2009-06-17 06:31:39 +08:00
|
|
|
/**
|
|
|
|
* get_user_pages() - pin user pages in memory
|
|
|
|
* @tsk: task_struct of target task
|
|
|
|
* @mm: mm_struct of target mm
|
|
|
|
* @start: starting user address
|
|
|
|
* @len: number of pages from start to pin
|
|
|
|
* @write: whether pages will be written to by the caller
|
|
|
|
* @force: whether to force write access even if user mapping is
|
|
|
|
* readonly. This will result in the page being COWed even
|
|
|
|
* in MAP_SHARED mappings. You do not want this.
|
|
|
|
* @pages: array that receives pointers to the pages pinned.
|
|
|
|
* Should be at least @len entries long. Or NULL, if caller
|
|
|
|
* only intends to ensure the pages are faulted in.
|
|
|
|
* @vmas: array of pointers to vmas corresponding to each page.
|
|
|
|
* Or NULL if the caller does not require them.
|
|
|
|
*
|
|
|
|
* Returns number of pages pinned. This may be fewer than the number
|
|
|
|
* requested. If len is 0 or negative, returns 0. If no pages
|
|
|
|
* were pinned, returns -errno. Each page returned must be released
|
|
|
|
* with a put_page() call when it is finished with. vmas will only
|
|
|
|
* remain valid while mmap_sem is held.
|
|
|
|
*
|
|
|
|
* Must be called with mmap_sem held for read or write.
|
|
|
|
*
|
|
|
|
* get_user_pages walks a process's page tables and takes a reference to
|
|
|
|
* each struct page that each user address corresponds to at a given
|
|
|
|
* instant. That is, it takes the page that would be accessed if a user
|
|
|
|
* thread accesses the given user virtual address at that instant.
|
|
|
|
*
|
|
|
|
* This does not guarantee that the page exists in the user mappings when
|
|
|
|
* get_user_pages returns, and there may even be a completely different
|
|
|
|
* page there in some cases (eg. if mmapped pagecache has been invalidated
|
|
|
|
* and subsequently refaulted). However it does guarantee that the page
|
|
|
|
* won't be freed completely. And mostly callers simply care that the page
|
|
|
|
* contains data that was valid *at some point in time*. Typically, an IO
|
|
|
|
* or similar operation cannot guarantee anything stronger anyway because
|
|
|
|
* locks can't be held over the syscall boundary.
|
|
|
|
*
|
|
|
|
* If write=0, the page must not be written to. If the page is written to,
|
|
|
|
* set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
|
|
|
|
* after the page is finished with, and before put_page is called.
|
|
|
|
*
|
|
|
|
* get_user_pages is typically used for fewer-copy IO operations, to get a
|
|
|
|
* handle on the memory by some means other than accesses via the user virtual
|
|
|
|
* addresses. The pages may be submitted for DMA to devices or accessed via
|
|
|
|
* their kernel linear mapping (via the kmap APIs). Care should be taken to
|
|
|
|
* use the correct cache flushing APIs.
|
|
|
|
*
|
|
|
|
* See also get_user_pages_fast, for performance critical applications.
|
|
|
|
*/
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
|
|
|
unsigned long start, int len, int write, int force,
|
|
|
|
struct page **pages, struct vm_area_struct **vmas)
|
|
|
|
{
|
|
|
|
int flags = 0;
|
|
|
|
|
|
|
|
if (write)
|
|
|
|
flags |= GUP_FLAGS_WRITE;
|
|
|
|
if (force)
|
|
|
|
flags |= GUP_FLAGS_FORCE;
|
|
|
|
|
|
|
|
return __get_user_pages(tsk, mm,
|
|
|
|
start, len, flags,
|
|
|
|
pages, vmas);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL(get_user_pages);
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
|
|
|
|
spinlock_t **ptl)
|
2005-11-30 06:03:14 +08:00
|
|
|
{
|
|
|
|
pgd_t * pgd = pgd_offset(mm, addr);
|
|
|
|
pud_t * pud = pud_alloc(mm, pgd, addr);
|
|
|
|
if (pud) {
|
2005-11-30 08:27:22 +08:00
|
|
|
pmd_t * pmd = pmd_alloc(mm, pud, addr);
|
2005-11-30 06:03:14 +08:00
|
|
|
if (pmd)
|
|
|
|
return pte_alloc_map_lock(mm, pmd, addr, ptl);
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-11-30 05:01:56 +08:00
|
|
|
/*
|
|
|
|
* This is the old fallback for page remapping.
|
|
|
|
*
|
|
|
|
* For historical reasons, it only allows reserved pages. Only
|
|
|
|
* old drivers should use this, and they needed to mark their
|
|
|
|
* pages reserved for the old functions anyway.
|
|
|
|
*/
|
2008-04-28 17:13:01 +08:00
|
|
|
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
struct page *page, pgprot_t prot)
|
2005-11-30 05:01:56 +08:00
|
|
|
{
|
2008-04-28 17:13:01 +08:00
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
2005-11-30 05:01:56 +08:00
|
|
|
int retval;
|
2005-11-30 06:03:14 +08:00
|
|
|
pte_t *pte;
|
2008-02-07 16:13:53 +08:00
|
|
|
spinlock_t *ptl;
|
|
|
|
|
2005-11-30 05:01:56 +08:00
|
|
|
retval = -EINVAL;
|
2005-12-01 01:35:19 +08:00
|
|
|
if (PageAnon(page))
|
2008-10-19 11:28:10 +08:00
|
|
|
goto out;
|
2005-11-30 05:01:56 +08:00
|
|
|
retval = -ENOMEM;
|
|
|
|
flush_dcache_page(page);
|
2005-11-30 06:03:14 +08:00
|
|
|
pte = get_locked_pte(mm, addr, &ptl);
|
2005-11-30 05:01:56 +08:00
|
|
|
if (!pte)
|
2008-10-19 11:28:10 +08:00
|
|
|
goto out;
|
2005-11-30 05:01:56 +08:00
|
|
|
retval = -EBUSY;
|
|
|
|
if (!pte_none(*pte))
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
/* Ok, finally just insert the thing.. */
|
|
|
|
get_page(page);
|
|
|
|
inc_mm_counter(mm, file_rss);
|
|
|
|
page_add_file_rmap(page);
|
|
|
|
set_pte_at(mm, addr, pte, mk_pte(page, prot));
|
|
|
|
|
|
|
|
retval = 0;
|
2008-02-07 16:13:53 +08:00
|
|
|
pte_unmap_unlock(pte, ptl);
|
|
|
|
return retval;
|
2005-11-30 05:01:56 +08:00
|
|
|
out_unlock:
|
|
|
|
pte_unmap_unlock(pte, ptl);
|
|
|
|
out:
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2006-09-26 14:31:22 +08:00
|
|
|
/**
|
|
|
|
* vm_insert_page - insert single page into user vma
|
|
|
|
* @vma: user vma to map to
|
|
|
|
* @addr: target user address of this page
|
|
|
|
* @page: source kernel page
|
|
|
|
*
|
2005-12-01 01:35:19 +08:00
|
|
|
* This allows drivers to insert individual pages they've allocated
|
|
|
|
* into a user vma.
|
|
|
|
*
|
|
|
|
* The page has to be a nice clean _individual_ kernel allocation.
|
|
|
|
* If you allocate a compound page, you need to have marked it as
|
|
|
|
* such (__GFP_COMP), or manually just split the page up yourself
|
2006-03-22 16:08:05 +08:00
|
|
|
* (see split_page()).
|
2005-12-01 01:35:19 +08:00
|
|
|
*
|
|
|
|
* NOTE! Traditionally this was done with "remap_pfn_range()" which
|
|
|
|
* took an arbitrary page protection parameter. This doesn't allow
|
|
|
|
* that. Your vma protection will have to be set up correctly, which
|
|
|
|
* means that if you want a shared writable mapping, you'd better
|
|
|
|
* ask for a shared writable mapping!
|
|
|
|
*
|
|
|
|
* The page does not need to be reserved.
|
|
|
|
*/
|
2008-04-28 17:13:01 +08:00
|
|
|
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
struct page *page)
|
2005-12-01 01:35:19 +08:00
|
|
|
{
|
|
|
|
if (addr < vma->vm_start || addr >= vma->vm_end)
|
|
|
|
return -EFAULT;
|
|
|
|
if (!page_count(page))
|
|
|
|
return -EINVAL;
|
2005-12-17 02:21:23 +08:00
|
|
|
vma->vm_flags |= VM_INSERTPAGE;
|
2008-04-28 17:13:01 +08:00
|
|
|
return insert_page(vma, addr, page, vma->vm_page_prot);
|
2005-12-01 01:35:19 +08:00
|
|
|
}
|
2005-12-04 12:48:11 +08:00
|
|
|
EXPORT_SYMBOL(vm_insert_page);
|
2005-12-01 01:35:19 +08:00
|
|
|
|
2008-04-28 17:13:01 +08:00
|
|
|
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pte_t *slot;
	pte_t entry;
	int err;

	/* No pte for @addr means the page tables could not be set up. */
	slot = get_locked_pte(mm, addr, &ptl);
	if (!slot)
		return -ENOMEM;

	if (pte_none(*slot)) {
		/* Ok, finally just insert the thing.. */
		entry = pte_mkspecial(pfn_pte(pfn, prot));
		set_pte_at(mm, addr, slot, entry);
		update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
		err = 0;
	} else {
		/* Already populated: refuse rather than overwrite. */
		err = -EBUSY;
	}

	pte_unmap_unlock(slot, ptl);
	return err;
}
|
|
|
|
|
2007-02-12 16:51:36 +08:00
|
|
|
/**
|
|
|
|
* vm_insert_pfn - insert single pfn into user vma
|
|
|
|
* @vma: user vma to map to
|
|
|
|
* @addr: target user address of this page
|
|
|
|
* @pfn: source kernel pfn
|
|
|
|
*
|
|
|
|
* Similar to vm_inert_page, this allows drivers to insert individual pages
|
|
|
|
* they've allocated into a user vma. Same comments apply.
|
|
|
|
*
|
|
|
|
* This function should only be called from a vm_ops->fault handler, and
|
|
|
|
* in that case the handler should return NULL.
|
2008-07-24 12:27:05 +08:00
|
|
|
*
|
|
|
|
* vma cannot be a COW mapping.
|
|
|
|
*
|
|
|
|
* As this is called only for pages that do not currently exist, we
|
|
|
|
* do not need to flush old virtual caches or the TLB.
|
2007-02-12 16:51:36 +08:00
|
|
|
*/
|
|
|
|
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
|
2008-04-28 17:13:01 +08:00
|
|
|
unsigned long pfn)
|
2007-02-12 16:51:36 +08:00
|
|
|
{
|
2008-12-19 03:41:29 +08:00
|
|
|
int ret;
|
2009-01-10 08:13:11 +08:00
|
|
|
pgprot_t pgprot = vma->vm_page_prot;
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly seperate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly seperate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
/*
|
|
|
|
* Technically, architectures with pte_special can avoid all these
|
|
|
|
* restrictions (same for remap_pfn_range). However we would like
|
|
|
|
* consistency in testing and feature parity among all, so we should
|
|
|
|
* try to keep these invariants in place for everybody.
|
|
|
|
*/
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that have no
struct page allocated to them. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
|
|
|
|
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
|
|
|
|
(VM_PFNMAP|VM_MIXEDMAP));
|
|
|
|
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
|
|
|
|
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
|
2007-02-12 16:51:36 +08:00
|
|
|
|
2008-04-28 17:13:01 +08:00
|
|
|
if (addr < vma->vm_start || addr >= vma->vm_end)
|
|
|
|
return -EFAULT;
|
2009-01-10 08:13:11 +08:00
|
|
|
if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
|
2008-12-19 03:41:29 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2009-01-10 08:13:11 +08:00
|
|
|
ret = insert_pfn(vma, addr, pfn, pgprot);
|
2008-12-19 03:41:29 +08:00
|
|
|
|
|
|
|
if (ret)
|
|
|
|
untrack_pfn_vma(vma, pfn, PAGE_SIZE);
|
|
|
|
|
|
|
|
return ret;
|
2008-04-28 17:13:01 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vm_insert_pfn);
|
2007-02-12 16:51:36 +08:00
|
|
|
|
2008-04-28 17:13:01 +08:00
|
|
|
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
unsigned long pfn)
|
|
|
|
{
|
|
|
|
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
|
2007-02-12 16:51:36 +08:00
|
|
|
|
2008-04-28 17:13:01 +08:00
|
|
|
if (addr < vma->vm_start || addr >= vma->vm_end)
|
|
|
|
return -EFAULT;
|
2007-02-12 16:51:36 +08:00
|
|
|
|
2008-04-28 17:13:01 +08:00
|
|
|
/*
|
|
|
|
* If we don't have pte special, then we have to use the pfn_valid()
|
|
|
|
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
|
|
|
|
* refcount the page if pfn_valid is true (hence insert_page rather
|
|
|
|
* than insert_pfn).
|
|
|
|
*/
|
|
|
|
if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
page = pfn_to_page(pfn);
|
|
|
|
return insert_page(vma, addr, page, vma->vm_page_prot);
|
|
|
|
}
|
|
|
|
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
|
2007-02-12 16:51:36 +08:00
|
|
|
}
|
2008-04-28 17:13:01 +08:00
|
|
|
EXPORT_SYMBOL(vm_insert_mixed);
|
2007-02-12 16:51:36 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* maps a range of physical memory into the requested pages. the old
|
|
|
|
* mappings are removed. any references to nonexistent pages results
|
|
|
|
* in null mappings (currently treated as "copy-on-access")
|
|
|
|
*/
|
|
|
|
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
unsigned long pfn, pgprot_t prot)
|
|
|
|
{
|
|
|
|
pte_t *pte;
|
2005-10-30 09:16:23 +08:00
|
|
|
spinlock_t *ptl;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-30 09:16:23 +08:00
|
|
|
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
2006-10-01 14:29:33 +08:00
|
|
|
arch_enter_lazy_mmu_mode();
|
2005-04-17 06:20:36 +08:00
|
|
|
do {
|
|
|
|
BUG_ON(!pte_none(*pte));
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly seperate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly seperate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
|
2005-04-17 06:20:36 +08:00
|
|
|
pfn++;
|
|
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
2006-10-01 14:29:33 +08:00
|
|
|
arch_leave_lazy_mmu_mode();
|
2005-10-30 09:16:23 +08:00
|
|
|
pte_unmap_unlock(pte - 1, ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	/*
	 * Bias the pfn by the starting virtual page number so that the pfn
	 * for any later address is simply pfn_base + (address >> PAGE_SHIFT).
	 */
	const unsigned long pfn_base = pfn - (addr >> PAGE_SHIFT);
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;

	/* Walk [addr, end) one pmd-sized piece at a time. */
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn_base + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
		pmd++;
		addr = next;
	} while (addr != end);

	return 0;
}
|
|
|
|
|
|
|
|
static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	/*
	 * Bias the pfn by the starting virtual page number so that the pfn
	 * for any later address is simply pfn_base + (address >> PAGE_SHIFT).
	 */
	const unsigned long pfn_base = pfn - (addr >> PAGE_SHIFT);
	unsigned long next;
	pud_t *pud;

	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;

	/* Walk [addr, end) one pud-sized piece at a time. */
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn_base + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
		pud++;
		addr = next;
	} while (addr != end);

	return 0;
}
|
|
|
|
|
2006-09-26 14:31:22 +08:00
|
|
|
/**
|
|
|
|
* remap_pfn_range - remap kernel memory to userspace
|
|
|
|
* @vma: user vma to map to
|
|
|
|
* @addr: target user address to start at
|
|
|
|
* @pfn: page frame number of the kernel physical memory to map
|
|
|
|
* @size: size of map area
|
|
|
|
* @prot: page protection flags for this mapping
|
|
|
|
*
|
|
|
|
* Note: this is only safe if the mm semaphore is held when called.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
unsigned long pfn, unsigned long size, pgprot_t prot)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
unsigned long next;
|
2005-06-26 05:54:33 +08:00
|
|
|
unsigned long end = addr + PAGE_ALIGN(size);
|
2005-04-17 06:20:36 +08:00
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Physically remapped pages are special. Tell the
|
|
|
|
* rest of the world about it:
|
|
|
|
* VM_IO tells people not to look at these pages
|
|
|
|
* (accesses can have side effects).
|
[PATCH] unpaged: VM_UNPAGED
Although we tend to associate VM_RESERVED with remap_pfn_range, quite a few
drivers set VM_RESERVED on areas which are then populated by nopage. The
PageReserved removal in 2.6.15-rc1 changed VM_RESERVED not to free pages in
zap_pte_range, without changing those drivers not to set it: so their pages
just leak away.
Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the core,
to flag the special areas where the ptes may have no struct page, or if they
have then it's not to be touched. Replace most instances of VM_RESERVED in
core mm by VM_UNPAGED. Force it on in remap_pfn_range, and the sparc and
sparc64 io_remap_pfn_range.
Revert addition of VM_RESERVED to powerpc vdso, it's not needed there. Is it
needed anywhere? It still governs the mm->reserved_vm statistic, and special
vmas not to be merged, and areas not to be core dumped; but could probably be
eliminated later (the drivers are probably specifying it because in 2.4 it
kept swapout off the vma, but in 2.6 we work from the LRU, which these pages
don't get on).
Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no
purpose whatsoever, and should be removed from drivers when we clean up.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 13:32:15 +08:00
|
|
|
* VM_RESERVED is specified all over the place, because
|
|
|
|
* in 2.4 it kept swapout's vma scan off this vma; but
|
|
|
|
* in 2.6 the LRU scan won't even find its pages, so this
|
|
|
|
* flag means no more than count its pages in reserved_vm,
|
|
|
|
* and omit it from core dump, even when VM_IO turned off.
|
2005-11-29 06:34:23 +08:00
|
|
|
* VM_PFNMAP tells the core MM that the base pages are just
|
|
|
|
* raw PFN mappings, and do not have a "struct page" associated
|
|
|
|
* with them.
|
2005-12-12 11:46:02 +08:00
|
|
|
*
|
|
|
|
* There's a horrible special case to handle copy-on-write
|
|
|
|
* behaviour that some programs depend on. We mark the "original"
|
|
|
|
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2009-03-13 08:45:27 +08:00
|
|
|
if (addr == vma->vm_start && end == vma->vm_end) {
|
2005-12-12 11:46:02 +08:00
|
|
|
vma->vm_pgoff = pfn;
|
2009-03-14 07:35:44 +08:00
|
|
|
vma->vm_flags |= VM_PFN_AT_MMAP;
|
2009-03-13 08:45:27 +08:00
|
|
|
} else if (is_cow_mapping(vma->vm_flags))
|
x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3
Impact: Code transformation, new functions added should have no effect.
Drivers use mmap followed by pgprot_* and remap_pfn_range or vm_insert_pfn,
in order to export reserved memory to userspace. Currently, such mappings are
not tracked and hence not kept consistent with other mappings (/dev/mem,
pci resource, ioremap) for the sme memory, that may exist in the system.
The following patchset adds x86 PAT attribute tracking and untracking for
pfnmap related APIs.
First three patches in the patchset are changing the generic mm code to fit
in this tracking. Last four patches are x86 specific to make things work
with x86 PAT code. The patchset aso introduces pgprot_writecombine interface,
which gives writecombine mapping when enabled, falling back to
pgprot_noncached otherwise.
This patch:
While working on x86 PAT, we faced some hurdles with trackking
remap_pfn_range() regions, as we do not have any information to say
whether that PFNMAP mapping is linear for the entire vma range or
it is smaller granularity regions within the vma.
A simple solution to this is to use vm_pgoff as an indicator for
linear mapping over the vma region. Currently, remap_pfn_range
only sets vm_pgoff for COW mappings. Below patch changes the
logic and sets the vm_pgoff irrespective of COW. This will still not
be enough for the case where pfn is zero (vma region mapped to
physical address zero). But, for all the other cases, we can look at
pfnmap VMAs and say whether the mappng is for the entire vma region
or not.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2008-12-19 03:41:27 +08:00
|
|
|
return -EINVAL;
|
2005-12-12 11:46:02 +08:00
|
|
|
|
2005-11-29 06:34:23 +08:00
|
|
|
vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-01-10 08:13:11 +08:00
|
|
|
err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
|
2009-01-10 08:13:09 +08:00
|
|
|
if (err) {
|
|
|
|
/*
|
|
|
|
* To indicate that track_pfn related cleanup is not
|
|
|
|
* needed from higher level routine calling unmap_vmas
|
|
|
|
*/
|
|
|
|
vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
|
2009-03-14 07:35:44 +08:00
|
|
|
vma->vm_flags &= ~VM_PFN_AT_MMAP;
|
2008-12-19 03:41:29 +08:00
|
|
|
return -EINVAL;
|
2009-01-10 08:13:09 +08:00
|
|
|
}
|
2008-12-19 03:41:29 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
BUG_ON(addr >= end);
|
|
|
|
pfn -= addr >> PAGE_SHIFT;
|
|
|
|
pgd = pgd_offset(mm, addr);
|
|
|
|
flush_cache_range(vma, addr, end);
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
err = remap_pud_range(mm, pgd, addr, next,
|
|
|
|
pfn + (addr >> PAGE_SHIFT), prot);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
} while (pgd++, addr = next, addr != end);
|
2008-12-19 03:41:29 +08:00
|
|
|
|
|
|
|
if (err)
|
|
|
|
untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(remap_pfn_range);
|
|
|
|
|
2007-05-07 05:48:54 +08:00
|
|
|
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
pte_fn_t fn, void *data)
|
|
|
|
{
|
|
|
|
pte_t *pte;
|
|
|
|
int err;
|
2008-02-08 20:22:04 +08:00
|
|
|
pgtable_t token;
|
2007-05-07 05:49:17 +08:00
|
|
|
spinlock_t *uninitialized_var(ptl);
|
2007-05-07 05:48:54 +08:00
|
|
|
|
|
|
|
pte = (mm == &init_mm) ?
|
|
|
|
pte_alloc_kernel(pmd, addr) :
|
|
|
|
pte_alloc_map_lock(mm, pmd, addr, &ptl);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
BUG_ON(pmd_huge(*pmd));
|
|
|
|
|
2009-01-07 06:39:21 +08:00
|
|
|
arch_enter_lazy_mmu_mode();
|
|
|
|
|
2008-02-08 20:22:04 +08:00
|
|
|
token = pmd_pgtable(*pmd);
|
2007-05-07 05:48:54 +08:00
|
|
|
|
|
|
|
do {
|
2008-02-08 20:22:04 +08:00
|
|
|
err = fn(pte, token, addr, data);
|
2007-05-07 05:48:54 +08:00
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
|
|
|
|
2009-01-07 06:39:21 +08:00
|
|
|
arch_leave_lazy_mmu_mode();
|
|
|
|
|
2007-05-07 05:48:54 +08:00
|
|
|
if (mm != &init_mm)
|
|
|
|
pte_unmap_unlock(pte-1, ptl);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
pte_fn_t fn, void *data)
|
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
unsigned long next;
|
|
|
|
int err;
|
|
|
|
|
2008-07-24 12:27:50 +08:00
|
|
|
BUG_ON(pud_huge(*pud));
|
|
|
|
|
2007-05-07 05:48:54 +08:00
|
|
|
pmd = pmd_alloc(mm, pud, addr);
|
|
|
|
if (!pmd)
|
|
|
|
return -ENOMEM;
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
} while (pmd++, addr = next, addr != end);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
pte_fn_t fn, void *data)
|
|
|
|
{
|
|
|
|
pud_t *pud;
|
|
|
|
unsigned long next;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
pud = pud_alloc(mm, pgd, addr);
|
|
|
|
if (!pud)
|
|
|
|
return -ENOMEM;
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
} while (pud++, addr = next, addr != end);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scan a region of virtual memory, filling in page tables as necessary
|
|
|
|
* and calling a provided function on each leaf page table.
|
|
|
|
*/
|
|
|
|
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
|
|
|
|
unsigned long size, pte_fn_t fn, void *data)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
unsigned long next;
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
unsigned long start = addr, end = addr + size;
|
2007-05-07 05:48:54 +08:00
|
|
|
int err;
|
|
|
|
|
|
|
|
BUG_ON(addr >= end);
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
mmu_notifier_invalidate_range_start(mm, start, end);
|
2007-05-07 05:48:54 +08:00
|
|
|
pgd = pgd_offset(mm, addr);
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
} while (pgd++, addr = next, addr != end);
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
mmu_notifier_invalidate_range_end(mm, start, end);
|
2007-05-07 05:48:54 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(apply_to_page_range);
|
|
|
|
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
/*
|
|
|
|
* handle_pte_fault chooses page fault handler according to an entry
|
|
|
|
* which was read non-atomically. Before making any commitment, on
|
|
|
|
* those architectures or configurations (e.g. i386 with PAE) which
|
|
|
|
* might give a mix of unmatched parts, do_swap_page and do_file_page
|
|
|
|
* must check under lock before unmapping the pte and proceeding
|
|
|
|
* (but do_wp_page is only called after already making such a check;
|
|
|
|
* and do_anonymous_page and do_no_page can safely check later on).
|
|
|
|
*/
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_t *page_table, pte_t orig_pte)
|
|
|
|
{
|
|
|
|
int same = 1;
|
|
|
|
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
|
|
|
|
if (sizeof(pte_t) > sizeof(unsigned long)) {
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
spinlock_t *ptl = pte_lockptr(mm, pmd);
|
|
|
|
spin_lock(ptl);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
same = pte_same(*page_table, orig_pte);
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
spin_unlock(ptl);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
pte_unmap(page_table);
|
|
|
|
return same;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
|
|
|
|
* servicing faults for write access. In the normal case, we always want
|
|
|
|
* pte_mkwrite. But get_user_pages can cause write faults for mappings
|
|
|
|
* that do not have writing enabled, when used by access_process_vm.
|
|
|
|
*/
|
|
|
|
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	/* Without VM_WRITE on the vma, hand the pte back untouched. */
	if (unlikely(!(vma->vm_flags & VM_WRITE)))
		return pte;

	/* Usual case: the vma permits writing, so make the pte writable. */
	return pte_mkwrite(pte);
}
|
|
|
|
|
2006-12-13 01:14:55 +08:00
|
|
|
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
|
2005-11-29 06:34:23 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If the source page was a PFN mapping, we don't have
|
|
|
|
* a "struct page" for it. We do a best-effort copy by
|
|
|
|
* just copying from the original user address. If that
|
|
|
|
* fails, we just zero-fill it. Live with it.
|
|
|
|
*/
|
|
|
|
if (unlikely(!src)) {
|
|
|
|
void *kaddr = kmap_atomic(dst, KM_USER0);
|
2005-11-30 06:07:55 +08:00
|
|
|
void __user *uaddr = (void __user *)(va & PAGE_MASK);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This really shouldn't fail, because the page is there
|
|
|
|
* in the page tables. But it might just be unreadable,
|
|
|
|
* in which case we just give up and fill the result with
|
|
|
|
* zeroes.
|
|
|
|
*/
|
|
|
|
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
|
2005-11-29 06:34:23 +08:00
|
|
|
memset(kaddr, 0, PAGE_SIZE);
|
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
[PATCH] mm: D-cache aliasing issue in cow_user_page
--=-=-=
from mm/memory.c:
1434 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
1435 {
1436 /*
1437 * If the source page was a PFN mapping, we don't have
1438 * a "struct page" for it. We do a best-effort copy by
1439 * just copying from the original user address. If that
1440 * fails, we just zero-fill it. Live with it.
1441 */
1442 if (unlikely(!src)) {
1443 void *kaddr = kmap_atomic(dst, KM_USER0);
1444 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1445
1446 /*
1447 * This really shouldn't fail, because the page is there
1448 * in the page tables. But it might just be unreadable,
1449 * in which case we just give up and fill the result with
1450 * zeroes.
1451 */
1452 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1453 memset(kaddr, 0, PAGE_SIZE);
1454 kunmap_atomic(kaddr, KM_USER0);
#### D-cache have to be flushed here.
#### It seems it is just forgotten.
1455 return;
1456
1457 }
1458 copy_user_highpage(dst, src, va);
#### Ok here. flush_dcache_page() called from this func if arch need it
1459 }
Following is the patch fix this issue:
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-20 14:29:08 +08:00
|
|
|
flush_dcache_page(dst);
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceeding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, wheras flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:34 +08:00
|
|
|
} else
|
|
|
|
copy_user_highpage(dst, src, va, vma);
|
2005-11-29 06:34:23 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* This routine handles present pages, when users try to write
|
|
|
|
* to a shared page. It is done by copying the page to a new address
|
|
|
|
* and decrementing the shared-page counter for the old page.
|
|
|
|
*
|
|
|
|
* Note that this routine assumes that the protection checks have been
|
|
|
|
* done by the caller (the low-level page fault routine in most cases).
|
|
|
|
* Thus we can safely just mark it writable once we've done any necessary
|
|
|
|
* COW.
|
|
|
|
*
|
|
|
|
* We also mark the page dirty at this point even though the page will
|
|
|
|
* change only once the write actually happens. This avoids a few races,
|
|
|
|
* and potentially makes it more efficient.
|
|
|
|
*
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* We enter with non-exclusive mmap_sem (to exclude vma changes,
|
|
|
|
* but allow concurrent faults), with pte both mapped and locked.
|
|
|
|
* We return with mmap_sem still held, but pte unmapped and unlocked.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pte_t *page_table, pmd_t *pmd,
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
spinlock_t *ptl, pte_t orig_pte)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-11-30 00:54:51 +08:00
|
|
|
struct page *old_page, *new_page;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte_t entry;
|
2007-07-19 16:47:05 +08:00
|
|
|
int reuse = 0, ret = 0;
|
2007-10-09 00:54:37 +08:00
|
|
|
int page_mkwrite = 0;
|
2006-09-26 14:30:57 +08:00
|
|
|
struct page *dirty_page = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-29 06:34:23 +08:00
|
|
|
old_page = vm_normal_page(vma, address, orig_pte);
|
2008-07-05 00:59:24 +08:00
|
|
|
if (!old_page) {
|
|
|
|
/*
|
|
|
|
* VM_MIXEDMAP !pfn_valid() case
|
|
|
|
*
|
|
|
|
* We should not cow pages in a shared writeable mapping.
|
|
|
|
* Just mark the pages writable as we can't do any dirty
|
|
|
|
* accounting on raw pfn maps.
|
|
|
|
*/
|
|
|
|
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
|
|
|
|
(VM_WRITE|VM_SHARED))
|
|
|
|
goto reuse;
|
2005-11-29 06:34:23 +08:00
|
|
|
goto gotten;
|
2008-07-05 00:59:24 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 14:30:57 +08:00
|
|
|
/*
|
2006-09-26 14:31:00 +08:00
|
|
|
* Take out anonymous pages first, anonymous shared vmas are
|
|
|
|
* not dirty accountable.
|
2006-09-26 14:30:57 +08:00
|
|
|
*/
|
2006-09-26 14:31:00 +08:00
|
|
|
if (PageAnon(old_page)) {
|
mm: wp lock page before deciding cow
An application may rely on get_user_pages() to give it pages writable from
userspace and shared with a driver, GUP breaking COW if necessary. It may
mprotect() the pages' writability, off and on, from time to time.
Normally this works fine (so long as the app does not fork); but just
occasionally, under memory pressure, a readonly pte in a newly writable
area is COWed unnecessarily, breaking the link with the driver: because
do_wp_page() does trylock_page, and falls back to COW whenever that fails.
For reliable behaviour in the unshared case, when the trylock_page fails,
now unlock pagetable, lock page and relock pagetable, before deciding
whether Copy-On-Write is really necessary.
Reported-by: Zhou Yingchao
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:33 +08:00
|
|
|
if (!trylock_page(old_page)) {
|
|
|
|
page_cache_get(old_page);
|
|
|
|
pte_unmap_unlock(page_table, ptl);
|
|
|
|
lock_page(old_page);
|
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address,
|
|
|
|
&ptl);
|
|
|
|
if (!pte_same(*page_table, orig_pte)) {
|
|
|
|
unlock_page(old_page);
|
|
|
|
page_cache_release(old_page);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
page_cache_release(old_page);
|
2006-09-26 14:31:00 +08:00
|
|
|
}
|
2009-01-07 06:39:34 +08:00
|
|
|
reuse = reuse_swap_page(old_page);
|
mm: wp lock page before deciding cow
An application may rely on get_user_pages() to give it pages writable from
userspace and shared with a driver, GUP breaking COW if necessary. It may
mprotect() the pages' writability, off and on, from time to time.
Normally this works fine (so long as the app does not fork); but just
occasionally, under memory pressure, a readonly pte in a newly writable
area is COWed unnecessarily, breaking the link with the driver: because
do_wp_page() does trylock_page, and falls back to COW whenever that fails.
For reliable behaviour in the unshared case, when the trylock_page fails,
now unlock pagetable, lock page and relock pagetable, before deciding
whether Copy-On-Write is really necessary.
Reported-by: Zhou Yingchao
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:33 +08:00
|
|
|
unlock_page(old_page);
|
2006-09-26 14:31:00 +08:00
|
|
|
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
|
2006-09-26 14:30:57 +08:00
|
|
|
(VM_WRITE|VM_SHARED))) {
|
2006-09-26 14:31:00 +08:00
|
|
|
/*
|
|
|
|
* Only catch write-faults on shared writable pages,
|
|
|
|
* read-only shared pages can get COWed by
|
|
|
|
* get_user_pages(.write=1, .force=1).
|
|
|
|
*/
|
2006-06-23 17:03:43 +08:00
|
|
|
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
|
2009-04-01 06:23:21 +08:00
|
|
|
struct vm_fault vmf;
|
|
|
|
int tmp;
|
|
|
|
|
|
|
|
vmf.virtual_address = (void __user *)(address &
|
|
|
|
PAGE_MASK);
|
|
|
|
vmf.pgoff = old_page->index;
|
|
|
|
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
|
|
|
|
vmf.page = old_page;
|
|
|
|
|
2006-06-23 17:03:43 +08:00
|
|
|
/*
|
|
|
|
* Notify the address space that the page is about to
|
|
|
|
* become writable so that it can prohibit this or wait
|
|
|
|
* for the page to get into an appropriate state.
|
|
|
|
*
|
|
|
|
* We do this without the lock held, so that it can
|
|
|
|
* sleep if it needs to.
|
|
|
|
*/
|
|
|
|
page_cache_get(old_page);
|
|
|
|
pte_unmap_unlock(page_table, ptl);
|
|
|
|
|
2009-04-01 06:23:21 +08:00
|
|
|
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
|
|
|
|
if (unlikely(tmp &
|
|
|
|
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
|
|
|
|
ret = tmp;
|
2006-06-23 17:03:43 +08:00
|
|
|
goto unwritable_page;
|
2009-04-01 06:23:21 +08:00
|
|
|
}
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change it's callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
|
|
|
|
lock_page(old_page);
|
|
|
|
if (!old_page->mapping) {
|
|
|
|
ret = 0; /* retry the fault */
|
|
|
|
unlock_page(old_page);
|
|
|
|
goto unwritable_page;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
VM_BUG_ON(!PageLocked(old_page));
|
2006-06-23 17:03:43 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Since we dropped the lock we need to revalidate
|
|
|
|
* the PTE as someone else may have changed it. If
|
|
|
|
* they did, we just return, as we can count on the
|
|
|
|
* MMU to tell us if they didn't also make it writable.
|
|
|
|
*/
|
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address,
|
|
|
|
&ptl);
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change it's callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (!pte_same(*page_table, orig_pte)) {
|
|
|
|
unlock_page(old_page);
|
|
|
|
page_cache_release(old_page);
|
2006-06-23 17:03:43 +08:00
|
|
|
goto unlock;
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change it's callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
}
|
2007-10-09 00:54:37 +08:00
|
|
|
|
|
|
|
page_mkwrite = 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-09-26 14:30:57 +08:00
|
|
|
dirty_page = old_page;
|
|
|
|
get_page(dirty_page);
|
2006-06-23 17:03:43 +08:00
|
|
|
reuse = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (reuse) {
|
2008-07-05 00:59:24 +08:00
|
|
|
reuse:
|
2006-06-23 17:03:43 +08:00
|
|
|
flush_cache_page(vma, address, pte_pfn(orig_pte));
|
|
|
|
entry = pte_mkyoung(orig_pte);
|
|
|
|
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
|
2007-10-16 16:25:44 +08:00
|
|
|
if (ptep_set_access_flags(vma, address, page_table, entry,1))
|
2007-06-17 01:16:12 +08:00
|
|
|
update_mmu_cache(vma, address, entry);
|
2006-06-23 17:03:43 +08:00
|
|
|
ret |= VM_FAULT_WRITE;
|
|
|
|
goto unlock;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, we need to copy. Oh, well..
|
|
|
|
*/
|
2005-10-30 09:16:12 +08:00
|
|
|
page_cache_get(old_page);
|
2005-11-22 13:32:17 +08:00
|
|
|
gotten:
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_unmap_unlock(page_table, ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (unlikely(anon_vma_prepare(vma)))
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
goto oom;
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
VM_BUG_ON(old_page == ZERO_PAGE(0));
|
|
|
|
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
|
|
|
|
if (!new_page)
|
|
|
|
goto oom;
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism lets pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages and
therefore PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
/*
|
|
|
|
* Don't let another task, with possibly unlocked vma,
|
|
|
|
* keep the mlocked page.
|
|
|
|
*/
|
2009-02-05 07:12:16 +08:00
|
|
|
if ((vma->vm_flags & VM_LOCKED) && old_page) {
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism lets pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages and
therefore PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
lock_page(old_page); /* for LRU manipulation */
|
|
|
|
clear_page_mlock(old_page);
|
|
|
|
unlock_page(old_page);
|
|
|
|
}
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
cow_user_page(new_page, old_page, address, vma);
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing so allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smp_wmb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, whereas flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:34 +08:00
|
|
|
__SetPageUptodate(new_page);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
|
2009-01-08 10:08:10 +08:00
|
|
|
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
|
2008-02-07 16:13:53 +08:00
|
|
|
goto oom_free_new;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Re-check the pte - we dropped the lock
|
|
|
|
*/
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
if (likely(pte_same(*page_table, orig_pte))) {
|
2005-11-22 13:32:17 +08:00
|
|
|
if (old_page) {
|
|
|
|
if (!PageAnon(old_page)) {
|
|
|
|
dec_mm_counter(mm, file_rss);
|
|
|
|
inc_mm_counter(mm, anon_rss);
|
|
|
|
}
|
|
|
|
} else
|
2005-10-30 09:16:05 +08:00
|
|
|
inc_mm_counter(mm, anon_rss);
|
2005-11-30 03:45:26 +08:00
|
|
|
flush_cache_page(vma, address, pte_pfn(orig_pte));
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
entry = mk_pte(new_page, vma->vm_page_prot);
|
|
|
|
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
|
[PATCH] mm: fix a race condition under SMC + COW
Failing context is a multi threaded process context and the failing
sequence is as follows.
One thread T0 doing self modifying code on page X on processor P0 and
another thread T1 doing COW (breaking the COW setup as part of just
happened fork() in another thread T2) on the same page X on processor P1.
T0 doing SMC can end up modifying the new page Y (allocated by the T1 doing
COW on P1) but because of different I/D TLB's, P0 ITLB will not see the new
mapping till the flush TLB IPI from P1 is received. During this interval,
if T0 executes the code created by SMC it can result in an app error (as
ITLB still points to old page X and ends up executing the content in page X
rather than using the content in page Y).
Fix this issue by first clearing the PTE and flushing it, before updating
it with new entry.
Hugh sayeth:
I was a bit sceptical, in the habit of thinking that Self Modifying Code
must look after such issues itself: but I guess there's nothing it can do to avoid
this one.
Fair enough, what you're changing it to is pretty much what powerpc and
s390 were already doing, and is a more robust way of proceeding, consistent
with how ptes are set everywhere else.
The ptep_clear_flush is a bit heavy-handed (it's anxious to return the pte
that was atomically cleared), but we'd have to wander through lots of arches
to get the right minimal behaviour. It'd also be nice to eliminate
ptep_establish completely, now only used to define other macros/inlines: it
always seemed obfuscation to me, what you've got there now is clearer.
Let's put those cleanups on a TODO list.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: "David S. Miller" <davem@davemloft.net>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-29 16:58:42 +08:00
|
|
|
/*
|
|
|
|
* Clear the pte entry and flush it first, before updating the
|
|
|
|
* pte with the new entry. This will avoid a race condition
|
|
|
|
* seen in the presence of one thread doing SMC and another
|
|
|
|
* thread doing COW.
|
|
|
|
*/
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increase in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used at driver startup, a failure can be gracefully handled. Here is
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver starts up other allocations are required anyway and
-ENOMEM failure paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
the kernel from compiling after these changes on non-x86 archs (x86 didn't
need them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-29 06:46:29 +08:00
|
|
|
ptep_clear_flush_notify(vma, address, page_table);
|
2006-01-06 16:11:12 +08:00
|
|
|
page_add_new_anon_rmap(new_page, vma, address);
|
2008-10-19 11:26:52 +08:00
|
|
|
set_pte_at(mm, address, page_table, entry);
|
|
|
|
update_mmu_cache(vma, address, entry);
|
2008-06-23 20:30:30 +08:00
|
|
|
if (old_page) {
|
|
|
|
/*
|
|
|
|
* Only after switching the pte to the new page may
|
|
|
|
* we remove the mapcount here. Otherwise another
|
|
|
|
* process may come and find the rmap count decremented
|
|
|
|
* before the pte is switched to the new page, and
|
|
|
|
* "reuse" the old page writing into it while our pte
|
|
|
|
* here still points into it and can be read by other
|
|
|
|
* threads.
|
|
|
|
*
|
|
|
|
* The critical issue is to order this
|
|
|
|
* page_remove_rmap with the ptep_clear_flush above.
|
|
|
|
* Those stores are ordered by (if nothing else,)
|
|
|
|
* the barrier present in the atomic_add_negative
|
|
|
|
* in page_remove_rmap.
|
|
|
|
*
|
|
|
|
* Then the TLB flush in ptep_clear_flush ensures that
|
|
|
|
* no process can access the old page before the
|
|
|
|
* decremented mapcount is visible. And the old page
|
|
|
|
* cannot be reused until after the decremented
|
|
|
|
* mapcount is visible. So transitively, TLBs to
|
|
|
|
* old page will be flushed before it can be reused.
|
|
|
|
*/
|
2009-01-07 06:40:11 +08:00
|
|
|
page_remove_rmap(old_page);
|
2008-06-23 20:30:30 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Free the old page.. */
|
|
|
|
new_page = old_page;
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 18:24:01 +08:00
|
|
|
ret |= VM_FAULT_WRITE;
|
2008-02-07 16:13:53 +08:00
|
|
|
} else
|
|
|
|
mem_cgroup_uncharge_page(new_page);
|
|
|
|
|
2005-11-22 13:32:17 +08:00
|
|
|
if (new_page)
|
|
|
|
page_cache_release(new_page);
|
|
|
|
if (old_page)
|
|
|
|
page_cache_release(old_page);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
unlock:
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_unmap_unlock(page_table, ptl);
|
2006-09-26 14:30:57 +08:00
|
|
|
if (dirty_page) {
|
2007-07-19 16:47:22 +08:00
|
|
|
/*
|
|
|
|
* Yes, Virginia, this is actually required to prevent a race
|
|
|
|
* with clear_page_dirty_for_io() from clearing the page dirty
|
|
|
|
* bit after it clears all dirty ptes, but before a racing
|
|
|
|
* do_wp_page installs a dirty pte.
|
|
|
|
*
|
|
|
|
* do_no_page is protected similarly.
|
|
|
|
*/
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (!page_mkwrite) {
|
|
|
|
wait_on_page_locked(dirty_page);
|
|
|
|
set_page_dirty_balance(dirty_page, page_mkwrite);
|
|
|
|
}
|
2006-09-26 14:30:57 +08:00
|
|
|
put_page(dirty_page);
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (page_mkwrite) {
|
|
|
|
struct address_space *mapping = dirty_page->mapping;
|
|
|
|
|
|
|
|
set_page_dirty(dirty_page);
|
|
|
|
unlock_page(dirty_page);
|
|
|
|
page_cache_release(dirty_page);
|
|
|
|
if (mapping) {
|
|
|
|
/*
|
|
|
|
* Some device drivers do not set page.mapping
|
|
|
|
* but still dirty their pages
|
|
|
|
*/
|
|
|
|
balance_dirty_pages_ratelimited(mapping);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* file_update_time outside page_lock */
|
|
|
|
if (vma->vm_file)
|
|
|
|
file_update_time(vma->vm_file);
|
2006-09-26 14:30:57 +08:00
|
|
|
}
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 18:24:01 +08:00
|
|
|
return ret;
|
2008-02-07 16:13:53 +08:00
|
|
|
oom_free_new:
|
2008-03-05 06:29:04 +08:00
|
|
|
page_cache_release(new_page);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
oom:
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (old_page) {
|
|
|
|
if (page_mkwrite) {
|
|
|
|
unlock_page(old_page);
|
|
|
|
page_cache_release(old_page);
|
|
|
|
}
|
2005-11-22 13:32:17 +08:00
|
|
|
page_cache_release(old_page);
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return VM_FAULT_OOM;
|
2006-06-23 17:03:43 +08:00
|
|
|
|
|
|
|
unwritable_page:
|
|
|
|
page_cache_release(old_page);
|
2009-04-01 06:23:21 +08:00
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper functions for unmap_mapping_range().
|
|
|
|
*
|
|
|
|
* __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
|
|
|
|
*
|
|
|
|
* We have to restart searching the prio_tree whenever we drop the lock,
|
|
|
|
* since the iterator is only valid while the lock is held, and anyway
|
|
|
|
* a later vma might be split and reinserted earlier while lock dropped.
|
|
|
|
*
|
|
|
|
* The list of nonlinear vmas could be handled more efficiently, using
|
|
|
|
* a placeholder, but handle it in the same way until a need is shown.
|
|
|
|
* It is important to search the prio_tree before nonlinear list: a vma
|
|
|
|
* may become nonlinear and be shifted from prio_tree to nonlinear list
|
|
|
|
* while the lock is dropped; but never shifted from list to prio_tree.
|
|
|
|
*
|
|
|
|
* In order to make forward progress despite restarting the search,
|
|
|
|
* vm_truncate_count is used to mark a vma as now dealt with, so we can
|
|
|
|
* quickly skip it next time around. Since the prio_tree search only
|
|
|
|
* shows us those vmas affected by unmapping the range in question, we
|
|
|
|
* can't efficiently keep all vmas in step with mapping->truncate_count:
|
|
|
|
* so instead reset them all whenever it wraps back to 0 (then go to 1).
|
|
|
|
* mapping->truncate_count and vma->vm_truncate_count are protected by
|
|
|
|
* i_mmap_lock.
|
|
|
|
*
|
|
|
|
* In order to make forward progress despite repeatedly restarting some
|
2005-04-20 04:29:15 +08:00
|
|
|
* large vma, note the restart_addr from unmap_vmas when it breaks out:
|
2005-04-17 06:20:36 +08:00
|
|
|
* and restart from that address when we reach that vma again. It might
|
|
|
|
* have been split or merged, shrunk or extended, but never shifted: so
|
|
|
|
* restart_addr remains valid so long as it remains in the vma's range.
|
|
|
|
* unmap_mapping_range forces truncate_count to leap over page-aligned
|
|
|
|
* values so we can save vma's restart_addr in its truncate_count field.
|
|
|
|
*/
|
|
|
|
/*
 * True when truncate_count has none of the ~PAGE_MASK bits set. Per the
 * notes above, unmap_mapping_range makes real truncate counts skip such
 * values, so a value passing this test is a saved restart address.
 */
#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
|
|
|
|
|
|
|
|
/*
 * reset_vma_truncate_counts - zero vm_truncate_count on every vma of a mapping.
 * @mapping: address_space whose vmas are reset.
 *
 * Walks every vma found in the mapping->i_mmap prio tree (index range
 * 0..ULONG_MAX) and then every vma on the mapping->i_mmap_nonlinear list,
 * storing 0 into each vma->vm_truncate_count.
 *
 * NOTE(review): the notes above say vm_truncate_count is protected by
 * i_mmap_lock; presumably the caller holds it -- confirm against callers.
 */
static void reset_vma_truncate_counts(struct address_space *mapping)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
struct prio_tree_iter iter;
|
|
|
|
|
|
|
|
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
|
|
|
|
vma->vm_truncate_count = 0;
|
|
|
|
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
|
|
|
|
vma->vm_truncate_count = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int unmap_mapping_range_vma(struct vm_area_struct *vma,
|
|
|
|
unsigned long start_addr, unsigned long end_addr,
|
|
|
|
struct zap_details *details)
|
|
|
|
{
|
|
|
|
unsigned long restart_addr;
|
|
|
|
int need_break;
|
|
|
|
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
/*
|
|
|
|
* files that support invalidating or truncating portions of the
|
2007-07-19 16:47:03 +08:00
|
|
|
* file from under mmaped areas must have their ->fault function
|
2007-07-19 16:47:05 +08:00
|
|
|
* return a locked page (and set VM_FAULT_LOCKED in the return).
|
|
|
|
* This provides synchronisation against concurrent unmapping here.
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
*/
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
again:
|
|
|
|
restart_addr = vma->vm_truncate_count;
|
|
|
|
if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
|
|
|
|
start_addr = restart_addr;
|
|
|
|
if (start_addr >= end_addr) {
|
|
|
|
/* Top of vma has been split off since last time */
|
|
|
|
vma->vm_truncate_count = details->truncate_count;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-20 04:29:15 +08:00
|
|
|
restart_addr = zap_page_range(vma, start_addr,
|
|
|
|
end_addr - start_addr, details);
|
2008-01-30 20:31:20 +08:00
|
|
|
need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-04-20 04:29:15 +08:00
|
|
|
if (restart_addr >= end_addr) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/* We have now completed this vma: mark it so */
|
|
|
|
vma->vm_truncate_count = details->truncate_count;
|
|
|
|
if (!need_break)
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
/* Note restart_addr in vma's truncate_count field */
|
2005-04-20 04:29:15 +08:00
|
|
|
vma->vm_truncate_count = restart_addr;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!need_break)
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock(details->i_mmap_lock);
|
|
|
|
cond_resched();
|
|
|
|
spin_lock(details->i_mmap_lock);
|
|
|
|
return -EINTR;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
|
|
|
|
struct zap_details *details)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
struct prio_tree_iter iter;
|
|
|
|
pgoff_t vba, vea, zba, zea;
|
|
|
|
|
|
|
|
restart:
|
|
|
|
vma_prio_tree_foreach(vma, &iter, root,
|
|
|
|
details->first_index, details->last_index) {
|
|
|
|
/* Skip quickly over those we have already dealt with */
|
|
|
|
if (vma->vm_truncate_count == details->truncate_count)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
vba = vma->vm_pgoff;
|
|
|
|
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
|
|
|
|
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
|
|
|
|
zba = details->first_index;
|
|
|
|
if (zba < vba)
|
|
|
|
zba = vba;
|
|
|
|
zea = details->last_index;
|
|
|
|
if (zea > vea)
|
|
|
|
zea = vea;
|
|
|
|
|
|
|
|
if (unmap_mapping_range_vma(vma,
|
|
|
|
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
|
|
|
|
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
|
|
|
|
details) < 0)
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unmap_mapping_range_list(struct list_head *head,
|
|
|
|
struct zap_details *details)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In nonlinear VMAs there is no correspondence between virtual address
|
|
|
|
* offset and file offset. So we must perform an exhaustive search
|
|
|
|
* across *all* the pages in each nonlinear VMA, not just the pages
|
|
|
|
* whose virtual address lies outside the file truncation point.
|
|
|
|
*/
|
|
|
|
restart:
|
|
|
|
list_for_each_entry(vma, head, shared.vm_set.list) {
|
|
|
|
/* Skip quickly over those we have already dealt with */
|
|
|
|
if (vma->vm_truncate_count == details->truncate_count)
|
|
|
|
continue;
|
|
|
|
details->nonlinear_vma = vma;
|
|
|
|
if (unmap_mapping_range_vma(vma, vma->vm_start,
|
|
|
|
vma->vm_end, details) < 0)
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2007-02-10 17:45:59 +08:00
|
|
|
* unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
|
2005-06-24 13:05:21 +08:00
|
|
|
* @mapping: the address space containing mmaps to be unmapped.
|
2005-04-17 06:20:36 +08:00
|
|
|
* @holebegin: byte in first page to unmap, relative to the start of
|
|
|
|
* the underlying file. This will be rounded down to a PAGE_SIZE
|
|
|
|
* boundary. Note that this is different from vmtruncate(), which
|
|
|
|
* must keep the partial page. In contrast, we must get rid of
|
|
|
|
* partial pages.
|
|
|
|
* @holelen: size of prospective hole in bytes. This will be rounded
|
|
|
|
* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
|
|
|
|
* end of the file.
|
|
|
|
* @even_cows: 1 when truncating a file, unmap even private COWed pages;
|
|
|
|
* but 0 when invalidating pagecache, don't throw away private data.
|
|
|
|
*/
|
|
|
|
void unmap_mapping_range(struct address_space *mapping,
|
|
|
|
loff_t const holebegin, loff_t const holelen, int even_cows)
|
|
|
|
{
|
|
|
|
struct zap_details details;
|
|
|
|
pgoff_t hba = holebegin >> PAGE_SHIFT;
|
|
|
|
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
/* Check for overflow. */
|
|
|
|
if (sizeof(holelen) > sizeof(hlen)) {
|
|
|
|
long long holeend =
|
|
|
|
(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
|
|
|
if (holeend & ~(long long)ULONG_MAX)
|
|
|
|
hlen = ULONG_MAX - hba + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
details.check_mapping = even_cows? NULL: mapping;
|
|
|
|
details.nonlinear_vma = NULL;
|
|
|
|
details.first_index = hba;
|
|
|
|
details.last_index = hba + hlen - 1;
|
|
|
|
if (details.last_index < details.first_index)
|
|
|
|
details.last_index = ULONG_MAX;
|
|
|
|
details.i_mmap_lock = &mapping->i_mmap_lock;
|
|
|
|
|
|
|
|
spin_lock(&mapping->i_mmap_lock);
|
|
|
|
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
/* Protect against endless unmapping loops */
|
2005-04-17 06:20:36 +08:00
|
|
|
mapping->truncate_count++;
|
|
|
|
if (unlikely(is_restart_addr(mapping->truncate_count))) {
|
|
|
|
if (mapping->truncate_count == 0)
|
|
|
|
reset_vma_truncate_counts(mapping);
|
|
|
|
mapping->truncate_count++;
|
|
|
|
}
|
|
|
|
details.truncate_count = mapping->truncate_count;
|
|
|
|
|
|
|
|
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
|
|
|
|
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
|
|
|
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
|
|
|
|
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
|
|
|
|
spin_unlock(&mapping->i_mmap_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(unmap_mapping_range);
|
|
|
|
|
2006-09-26 14:31:22 +08:00
|
|
|
/**
|
|
|
|
* vmtruncate - unmap mappings "freed" by truncate() syscall
|
|
|
|
* @inode: inode of the file used
|
|
|
|
* @offset: file offset to start truncating
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* NOTE! We have to be ready to update the memory sharing
|
|
|
|
* between the file and the memory map for a potential last
|
|
|
|
* incomplete page. Ugly, but necessary.
|
|
|
|
*/
|
|
|
|
int vmtruncate(struct inode * inode, loff_t offset)
|
|
|
|
{
|
2008-02-05 14:28:56 +08:00
|
|
|
if (inode->i_size < offset) {
|
|
|
|
unsigned long limit;
|
|
|
|
|
|
|
|
limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
|
|
|
|
if (limit != RLIM_INFINITY && offset > limit)
|
|
|
|
goto out_sig;
|
|
|
|
if (offset > inode->i_sb->s_maxbytes)
|
|
|
|
goto out_big;
|
|
|
|
i_size_write(inode, offset);
|
|
|
|
} else {
|
|
|
|
struct address_space *mapping = inode->i_mapping;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-05 14:28:56 +08:00
|
|
|
/*
|
|
|
|
* truncation of in-use swapfiles is disallowed - it would
|
|
|
|
* cause subsequent swapout to scribble on the now-freed
|
|
|
|
* blocks.
|
|
|
|
*/
|
|
|
|
if (IS_SWAPFILE(inode))
|
|
|
|
return -ETXTBSY;
|
|
|
|
i_size_write(inode, offset);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* unmap_mapping_range is called twice, first simply for
|
|
|
|
* efficiency so that truncate_inode_pages does fewer
|
|
|
|
* single-page unmaps. However after this first call, and
|
|
|
|
* before truncate_inode_pages finishes, it is possible for
|
|
|
|
* private pages to be COWed, which remain after
|
|
|
|
* truncate_inode_pages finishes, hence the second
|
|
|
|
* unmap_mapping_range call must be made for correctness.
|
|
|
|
*/
|
|
|
|
unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
|
|
|
|
truncate_inode_pages(mapping, offset);
|
|
|
|
unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
|
|
|
|
}
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
2008-12-04 23:06:33 +08:00
|
|
|
if (inode->i_op->truncate)
|
2005-04-17 06:20:36 +08:00
|
|
|
inode->i_op->truncate(inode);
|
|
|
|
return 0;
|
2008-02-05 14:28:56 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
out_sig:
|
|
|
|
send_sig(SIGXFSZ, current, 0);
|
|
|
|
out_big:
|
|
|
|
return -EFBIG;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vmtruncate);
|
|
|
|
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 16:10:38 +08:00
|
|
|
int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the underlying filesystem is not going to provide
|
|
|
|
* a way to truncate a range of blocks (punch a hole) -
|
|
|
|
* we should return failure right now.
|
|
|
|
*/
|
2008-12-04 23:06:33 +08:00
|
|
|
if (!inode->i_op->truncate_range)
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 16:10:38 +08:00
|
|
|
return -ENOSYS;
|
|
|
|
|
2006-01-10 07:59:24 +08:00
|
|
|
mutex_lock(&inode->i_mutex);
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 16:10:38 +08:00
|
|
|
down_write(&inode->i_alloc_sem);
|
|
|
|
unmap_mapping_range(mapping, offset, (end - offset), 1);
|
|
|
|
truncate_inode_pages_range(mapping, offset, end);
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
unmap_mapping_range(mapping, offset, (end - offset), 1);
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 16:10:38 +08:00
|
|
|
inode->i_op->truncate_range(inode, offset, end);
|
|
|
|
up_write(&inode->i_alloc_sem);
|
2006-01-10 07:59:24 +08:00
|
|
|
mutex_unlock(&inode->i_mutex);
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 16:10:38 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* We enter with non-exclusive mmap_sem (to exclude vma changes,
|
|
|
|
* but allow concurrent faults), and pte mapped but not yet locked.
|
|
|
|
* We return with mmap_sem still held, but pte unmapped and unlocked.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pte_t *page_table, pmd_t *pmd,
|
2009-04-10 23:43:11 +08:00
|
|
|
unsigned int flags, pte_t orig_pte)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
spinlock_t *ptl;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct page *page;
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
swp_entry_t entry;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte_t pte;
|
2009-01-08 10:07:48 +08:00
|
|
|
struct mem_cgroup *ptr = NULL;
|
2007-07-19 16:47:05 +08:00
|
|
|
int ret = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
goto out;
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
|
|
|
|
entry = pte_to_swp_entry(orig_pte);
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to the page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:35 +08:00
|
|
|
if (is_migration_entry(entry)) {
|
|
|
|
migration_entry_wait(mm, pmd, address);
|
|
|
|
goto out;
|
|
|
|
}
|
2006-07-14 15:24:37 +08:00
|
|
|
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
|
2005-04-17 06:20:36 +08:00
|
|
|
page = lookup_swap_cache(entry);
|
|
|
|
if (!page) {
|
2009-06-24 03:36:58 +08:00
|
|
|
grab_swap_token(mm); /* Contend for token _before_ read-in */
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or __GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:28:42 +08:00
|
|
|
page = swapin_readahead(entry,
|
|
|
|
GFP_HIGHUSER_MOVABLE, vma, address);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!page) {
|
|
|
|
/*
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* Back out if somebody else faulted in this pte
|
|
|
|
* while we released the pte lock.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (likely(pte_same(*page_table, orig_pte)))
|
|
|
|
ret = VM_FAULT_OOM;
|
2006-07-14 15:24:37 +08:00
|
|
|
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
goto unlock;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Had to read the page from swap area: Major fault */
|
|
|
|
ret = VM_FAULT_MAJOR;
|
2006-06-30 16:55:45 +08:00
|
|
|
count_vm_event(PGMAJFAULT);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-10-19 11:28:08 +08:00
|
|
|
lock_page(page);
|
|
|
|
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
|
|
|
|
|
2009-01-08 10:08:10 +08:00
|
|
|
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
|
2008-02-07 16:13:53 +08:00
|
|
|
ret = VM_FAULT_OOM;
|
2009-05-01 06:08:08 +08:00
|
|
|
goto out_page;
|
2008-02-07 16:13:53 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* Back out if somebody else already faulted in this pte.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
|
2005-10-30 09:16:15 +08:00
|
|
|
if (unlikely(!pte_same(*page_table, orig_pte)))
|
2005-05-17 12:53:50 +08:00
|
|
|
goto out_nomap;
|
|
|
|
|
|
|
|
if (unlikely(!PageUptodate(page))) {
|
|
|
|
ret = VM_FAULT_SIGBUS;
|
|
|
|
goto out_nomap;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-01-08 10:08:00 +08:00
|
|
|
/*
|
|
|
|
* The page isn't present yet, go ahead with the fault.
|
|
|
|
*
|
|
|
|
* Be careful about the sequence of operations here.
|
|
|
|
* To get its accounting right, reuse_swap_page() must be called
|
|
|
|
* while the page is counted on swap but not yet in mapcount i.e.
|
|
|
|
* before page_add_anon_rmap() and swap_free(); try_to_free_swap()
|
|
|
|
* must be called after the swap_free(), or it will never succeed.
|
2009-01-08 10:08:31 +08:00
|
|
|
* Because delete_from_swap_page() may be called by reuse_swap_page(),
|
|
|
|
* mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
|
|
|
|
* in page->private. In this case, a record in swap_cgroup is silently
|
|
|
|
* discarded at swap_free().
|
2009-01-08 10:08:00 +08:00
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-30 09:16:05 +08:00
|
|
|
inc_mm_counter(mm, anon_rss);
|
2005-04-17 06:20:36 +08:00
|
|
|
pte = mk_pte(page, vma->vm_page_prot);
|
2009-04-10 23:43:11 +08:00
|
|
|
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
|
2009-04-10 23:43:11 +08:00
|
|
|
flags &= ~FAULT_FLAG_WRITE;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
flush_icache_page(vma, page);
|
|
|
|
set_pte_at(mm, address, page_table, pte);
|
|
|
|
page_add_anon_rmap(page, vma, address);
|
2009-01-08 10:08:31 +08:00
|
|
|
/* It's better to call commit-charge after rmap is established */
|
|
|
|
mem_cgroup_commit_charge_swapin(page, ptr);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] can_share_swap_page: use page_mapcount
Remember that ironic get_user_pages race? when the raised page_count on a
page swapped out led do_wp_page to decide that it had to copy on write, so
substituted a different page into userspace. 2.6.7 onwards have Andrea's
solution, where try_to_unmap_one backs out if it finds page_count raised.
Which works, but is unsatisfying (rmap.c has no other page_count heuristics),
and was found a few months ago to hang an intensive page migration test. A
year ago I was hesitant to engage page_mapcount, now it seems the right fix.
So remove the page_count hack from try_to_unmap_one; and use activate_page in
unuse_mm when dropping lock, to replace its secondary effect of helping
swapoff to make progress in that case.
Simplify can_share_swap_page (now called only on anonymous pages) to check
page_mapcount + page_swapcount == 1: still needs the page lock to stabilize
their (pessimistic) sum, but does not need swapper_space.tree_lock for that.
In do_swap_page, move swap_free and unlock_page below page_add_anon_rmap, to
keep sum on the high side, and correct when can_share_swap_page called.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-22 08:15:12 +08:00
|
|
|
swap_free(entry);
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism lets pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because current get_user_pages() can't grab PROT_NONE pages, and therefore
PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
2009-01-07 06:39:36 +08:00
|
|
|
try_to_free_swap(page);
|
[PATCH] can_share_swap_page: use page_mapcount
Remember that ironic get_user_pages race? when the raised page_count on a
page swapped out led do_wp_page to decide that it had to copy on write, so
substituted a different page into userspace. 2.6.7 onwards have Andrea's
solution, where try_to_unmap_one backs out if it finds page_count raised.
Which works, but is unsatisfying (rmap.c has no other page_count heuristics),
and was found a few months ago to hang an intensive page migration test. A
year ago I was hesitant to engage page_mapcount, now it seems the right fix.
So remove the page_count hack from try_to_unmap_one; and use activate_page in
unuse_mm when dropping lock, to replace its secondary effect of helping
swapoff to make progress in that case.
Simplify can_share_swap_page (now called only on anonymous pages) to check
page_mapcount + page_swapcount == 1: still needs the page lock to stabilize
their (pessimistic) sum, but does not need swapper_space.tree_lock for that.
In do_swap_page, move swap_free and unlock_page below page_add_anon_rmap, to
keep sum on the high side, and correct when can_share_swap_page called.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-22 08:15:12 +08:00
|
|
|
unlock_page(page);
|
|
|
|
|
2009-04-10 23:43:11 +08:00
|
|
|
if (flags & FAULT_FLAG_WRITE) {
|
2008-03-05 06:29:04 +08:00
|
|
|
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
|
|
|
|
if (ret & VM_FAULT_ERROR)
|
|
|
|
ret &= VM_FAULT_ERROR;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* No need to invalidate - it was non-present before */
|
|
|
|
update_mmu_cache(vma, address, pte);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
unlock:
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_unmap_unlock(page_table, ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
|
|
|
return ret;
|
2005-05-17 12:53:50 +08:00
|
|
|
out_nomap:
|
2009-01-08 10:07:48 +08:00
|
|
|
mem_cgroup_cancel_charge_swapin(ptr);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_unmap_unlock(page_table, ptl);
|
2009-05-01 06:08:08 +08:00
|
|
|
out_page:
|
2005-05-17 12:53:50 +08:00
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* We enter with non-exclusive mmap_sem (to exclude vma changes,
|
|
|
|
* but allow concurrent faults), and pte mapped but not yet locked.
|
|
|
|
* We return with mmap_sem still held, but pte unmapped and unlocked.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pte_t *page_table, pmd_t *pmd,
|
2009-04-10 23:43:11 +08:00
|
|
|
unsigned int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
struct page *page;
|
|
|
|
spinlock_t *ptl;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte_t entry;
|
|
|
|
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
/* Allocate our own private page. */
|
|
|
|
pte_unmap(page_table);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
if (unlikely(anon_vma_prepare(vma)))
|
|
|
|
goto oom;
|
|
|
|
page = alloc_zeroed_user_highpage_movable(vma, address);
|
|
|
|
if (!page)
|
|
|
|
goto oom;
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing so allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smp_wmb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, whereas flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:34 +08:00
|
|
|
__SetPageUptodate(page);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
|
2009-01-08 10:08:10 +08:00
|
|
|
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
|
2008-02-07 16:13:53 +08:00
|
|
|
goto oom_free_page;
|
|
|
|
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
entry = mk_pte(page, vma->vm_page_prot);
|
|
|
|
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note
A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
(and thus mapcounted and count towards shared rss). These writes to
the struct page could cause excessive cacheline bouncing on big
systems. There are a number of ways this could be addressed if it is
an issue.
And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.
This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).
There are several broad ways to fix this problem:
1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely
I will argue for 3. The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.
Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards. If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).
As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not
increase much without it.
When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second. 1 page clear is cheaper
than a thousand faults, presumably, so there isn't an obvious loss.
Neither the logical argument nor these basic tests give a guarantee of no
regressions. However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path. If it is found to cause regressions,
we can reintroduce it and just avoid refcounting it.
The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except on benchmarks. All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity. We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:24:40 +08:00
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
|
|
|
|
if (!pte_none(*page_table))
|
|
|
|
goto release;
|
|
|
|
inc_mm_counter(mm, anon_rss);
|
|
|
|
page_add_new_anon_rmap(page, vma, address);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
set_pte_at(mm, address, page_table, entry);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* No need to invalidate - it was non-present before */
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
update_mmu_cache(vma, address, entry);
|
|
|
|
unlock:
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_unmap_unlock(page_table, ptl);
|
2007-07-19 16:47:05 +08:00
|
|
|
return 0;
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
release:
|
2008-02-07 16:13:53 +08:00
|
|
|
mem_cgroup_uncharge_page(page);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
page_cache_release(page);
|
|
|
|
goto unlock;
|
2008-02-07 16:13:53 +08:00
|
|
|
oom_free_page:
|
2008-03-05 06:29:04 +08:00
|
|
|
page_cache_release(page);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
oom:
|
2005-04-17 06:20:36 +08:00
|
|
|
return VM_FAULT_OOM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-07-19 16:46:59 +08:00
|
|
|
* __do_fault() tries to create a new page mapping. It aggressively
|
2005-04-17 06:20:36 +08:00
|
|
|
* tries to share with existing pages, but makes a separate copy if
|
2007-07-19 16:46:59 +08:00
|
|
|
* FAULT_FLAG_WRITE is set in the flags parameter, in order to avoid
|
|
|
|
* the next page fault.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* As this is called only for pages that do not currently exist, we
|
|
|
|
* do not need to flush old virtual caches or the TLB.
|
|
|
|
*
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* We enter with non-exclusive mmap_sem (to exclude vma changes,
|
2007-10-04 23:56:06 +08:00
|
|
|
* but allow concurrent faults), and pte neither mapped nor locked.
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
* We return with mmap_sem still held, but pte unmapped and unlocked.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-07-19 16:46:59 +08:00
|
|
|
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
2007-10-04 23:56:06 +08:00
|
|
|
unsigned long address, pmd_t *pmd,
|
2007-07-19 16:46:59 +08:00
|
|
|
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-10-04 23:56:06 +08:00
|
|
|
pte_t *page_table;
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
spinlock_t *ptl;
|
2007-07-19 16:47:03 +08:00
|
|
|
struct page *page;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte_t entry;
|
|
|
|
int anon = 0;
|
2008-10-19 11:28:10 +08:00
|
|
|
int charged = 0;
|
2006-09-26 14:30:57 +08:00
|
|
|
struct page *dirty_page = NULL;
|
2007-07-19 16:47:03 +08:00
|
|
|
struct vm_fault vmf;
|
|
|
|
int ret;
|
2007-10-09 00:54:37 +08:00
|
|
|
int page_mkwrite = 0;
|
2007-07-19 16:46:59 +08:00
|
|
|
|
2007-07-19 16:47:03 +08:00
|
|
|
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
|
|
|
|
vmf.pgoff = pgoff;
|
|
|
|
vmf.flags = flags;
|
|
|
|
vmf.page = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-28 17:12:10 +08:00
|
|
|
ret = vma->vm_ops->fault(vma, &vmf);
|
|
|
|
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
|
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
/*
|
2007-07-19 16:47:03 +08:00
|
|
|
* For consistency in subsequent calls, make the faulted page always
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
* locked.
|
|
|
|
*/
|
2007-07-19 16:47:05 +08:00
|
|
|
if (unlikely(!(ret & VM_FAULT_LOCKED)))
|
2007-07-19 16:47:03 +08:00
|
|
|
lock_page(vmf.page);
|
2007-07-19 16:46:59 +08:00
|
|
|
else
|
2007-07-19 16:47:03 +08:00
|
|
|
VM_BUG_ON(!PageLocked(vmf.page));
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Should we do an early C-O-W break?
|
|
|
|
*/
|
2007-07-19 16:47:03 +08:00
|
|
|
page = vmf.page;
|
2007-07-19 16:46:59 +08:00
|
|
|
if (flags & FAULT_FLAG_WRITE) {
|
2006-06-23 17:03:43 +08:00
|
|
|
if (!(vma->vm_flags & VM_SHARED)) {
|
2007-07-19 16:46:59 +08:00
|
|
|
anon = 1;
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
if (unlikely(anon_vma_prepare(vma))) {
|
2007-07-19 16:47:03 +08:00
|
|
|
ret = VM_FAULT_OOM;
|
2007-07-19 16:46:59 +08:00
|
|
|
goto out;
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
}
|
2007-07-19 16:47:05 +08:00
|
|
|
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
|
|
|
|
vma, address);
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
if (!page) {
|
2007-07-19 16:47:03 +08:00
|
|
|
ret = VM_FAULT_OOM;
|
2007-07-19 16:46:59 +08:00
|
|
|
goto out;
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
}
|
2009-01-08 10:08:10 +08:00
|
|
|
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
|
2008-10-19 11:28:10 +08:00
|
|
|
ret = VM_FAULT_OOM;
|
|
|
|
page_cache_release(page);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
charged = 1;
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism lets pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages, and therefore
PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:26:44 +08:00
|
|
|
/*
|
|
|
|
* Don't let another task, with possibly unlocked vma,
|
|
|
|
* keep the mlocked page.
|
|
|
|
*/
|
|
|
|
if (vma->vm_flags & VM_LOCKED)
|
|
|
|
clear_page_mlock(vmf.page);
|
2007-07-19 16:47:03 +08:00
|
|
|
copy_user_highpage(page, vmf.page, address, vma);
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing so allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smp_wmb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, whereas flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:34 +08:00
|
|
|
__SetPageUptodate(page);
|
2006-06-23 17:03:43 +08:00
|
|
|
} else {
|
2007-07-19 16:46:59 +08:00
|
|
|
/*
|
|
|
|
* If the page will be shareable, see if the backing
|
2006-06-23 17:03:43 +08:00
|
|
|
* address space wants to know that the page is about
|
2007-07-19 16:46:59 +08:00
|
|
|
* to become writable
|
|
|
|
*/
|
2007-07-19 16:47:00 +08:00
|
|
|
if (vma->vm_ops->page_mkwrite) {
|
2009-04-01 06:23:21 +08:00
|
|
|
int tmp;
|
|
|
|
|
2007-07-19 16:47:00 +08:00
|
|
|
unlock_page(page);
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
|
2009-04-01 06:23:21 +08:00
|
|
|
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
|
|
|
|
if (unlikely(tmp &
|
|
|
|
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
|
|
|
|
ret = tmp;
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
goto unwritable_page;
|
2007-07-19 16:47:03 +08:00
|
|
|
}
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
|
|
|
|
lock_page(page);
|
|
|
|
if (!page->mapping) {
|
|
|
|
ret = 0; /* retry the fault */
|
|
|
|
unlock_page(page);
|
|
|
|
goto unwritable_page;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
VM_BUG_ON(!PageLocked(page));
|
2007-10-09 00:54:37 +08:00
|
|
|
page_mkwrite = 1;
|
2006-06-23 17:03:43 +08:00
|
|
|
}
|
|
|
|
}
|
2007-07-19 16:46:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This silly early PAGE_DIRTY setting removes a race
|
|
|
|
* due to the bad i386 page protection. But it's valid
|
|
|
|
* for other architectures too.
|
|
|
|
*
|
2009-04-10 23:43:11 +08:00
|
|
|
* Note that if FAULT_FLAG_WRITE is set, we either now have
|
2005-04-17 06:20:36 +08:00
|
|
|
* an exclusive copy of the page, or this is a shared mapping,
|
|
|
|
* so we can make it writable and dirty to avoid having to
|
|
|
|
* handle that later.
|
|
|
|
*/
|
|
|
|
/* Only go through if we didn't race with anybody else... */
|
2007-07-19 16:46:59 +08:00
|
|
|
if (likely(pte_same(*page_table, orig_pte))) {
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
flush_icache_page(vma, page);
|
|
|
|
entry = mk_pte(page, vma->vm_page_prot);
|
2007-07-19 16:46:59 +08:00
|
|
|
if (flags & FAULT_FLAG_WRITE)
|
2005-04-17 06:20:36 +08:00
|
|
|
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
|
|
|
|
if (anon) {
|
2008-10-19 11:26:52 +08:00
|
|
|
inc_mm_counter(mm, anon_rss);
|
|
|
|
page_add_new_anon_rmap(page, vma, address);
|
2005-11-22 13:32:19 +08:00
|
|
|
} else {
|
2005-10-30 09:16:05 +08:00
|
|
|
inc_mm_counter(mm, file_rss);
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
page_add_file_rmap(page);
|
2007-07-19 16:46:59 +08:00
|
|
|
if (flags & FAULT_FLAG_WRITE) {
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
dirty_page = page;
|
2006-09-26 14:30:57 +08:00
|
|
|
get_page(dirty_page);
|
|
|
|
}
|
2005-10-30 09:16:05 +08:00
|
|
|
}
|
2008-10-19 11:26:52 +08:00
|
|
|
set_pte_at(mm, address, page_table, entry);
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
|
|
|
/* no need to invalidate: a not-present page won't be cached */
|
|
|
|
update_mmu_cache(vma, address, entry);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
2008-10-19 11:28:10 +08:00
|
|
|
if (charged)
|
|
|
|
mem_cgroup_uncharge_page(page);
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
if (anon)
|
|
|
|
page_cache_release(page);
|
|
|
|
else
|
2007-07-19 16:46:59 +08:00
|
|
|
anon = 1; /* no anon but release faulted_page */
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
pte_unmap_unlock(page_table, ptl);
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
|
|
|
out:
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (dirty_page) {
|
|
|
|
struct address_space *mapping = page->mapping;
|
2008-01-23 07:21:18 +08:00
|
|
|
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (set_page_dirty(dirty_page))
|
|
|
|
page_mkwrite = 1;
|
|
|
|
unlock_page(dirty_page);
|
2006-09-26 14:30:57 +08:00
|
|
|
put_page(dirty_page);
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
if (page_mkwrite && mapping) {
|
|
|
|
/*
|
|
|
|
* Some device drivers do not set page.mapping but still
|
|
|
|
* dirty their pages
|
|
|
|
*/
|
|
|
|
balance_dirty_pages_ratelimited(mapping);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* file_update_time outside page_lock */
|
|
|
|
if (vma->vm_file)
|
|
|
|
file_update_time(vma->vm_file);
|
|
|
|
} else {
|
|
|
|
unlock_page(vmf.page);
|
|
|
|
if (anon)
|
|
|
|
page_cache_release(vmf.page);
|
2006-09-26 14:30:57 +08:00
|
|
|
}
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
2007-07-19 16:47:05 +08:00
|
|
|
return ret;
|
mm: close page_mkwrite races
Change page_mkwrite to allow implementations to return with the page
locked, and also change its callers (in page fault paths) to hold the
lock until the page is marked dirty. This allows the filesystem to have
full control of page dirtying events coming from the VM.
Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.
The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite. It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).
In this window, the VM could write out the page, clearing page-dirty. The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.
It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail. The
filesystem may need to allocate some data structure, for example.
And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.
This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window. This provides the filesystem with a strong
synchronisation against the VM here.
- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
under dirty pages might be susceptible to i_size changing under partial page
at the end of file (we also have a buffer.c-side problem here, but it cannot
be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
page_mkwrite functions themselves.
[ This also moves page_mkwrite another step closer to fault, which should
eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
filesystem calldown and page lock/unlock cycle in __do_fault. ]
[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-05-01 06:08:16 +08:00
|
|
|
|
|
|
|
unwritable_page:
|
|
|
|
page_cache_release(page);
|
|
|
|
return ret;
|
2007-07-19 16:46:59 +08:00
|
|
|
}
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. Invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
2007-07-19 16:46:59 +08:00
|
|
|
/*
 * do_linear_fault - thin front end to __do_fault().
 *
 * Converts the faulting address into a page index (pgoff): the number of
 * pages between vma->vm_start and the page-aligned address, plus
 * vma->vm_pgoff.  The pte mapping passed in as page_table is dropped with
 * pte_unmap() before the hand-off; __do_fault() is given pmd instead of
 * page_table.
 *
 * Returns whatever __do_fault() returns.
 */
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pte_t *page_table, pmd_t *pmd,
|
2009-04-10 23:43:11 +08:00
|
|
|
unsigned int flags, pte_t orig_pte)
|
2007-07-19 16:46:59 +08:00
|
|
|
{
|
|
|
|
pgoff_t pgoff = (((address & PAGE_MASK)
|
2007-10-16 16:24:45 +08:00
|
|
|
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
2007-07-19 16:46:59 +08:00
|
|
|
|
2007-10-04 23:56:06 +08:00
|
|
|
/* page_table is not used past this point; only pmd is passed on. */
pte_unmap(page_table);
|
|
|
|
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
|
2007-07-19 16:46:59 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Fault of a previously existing named mapping. Repopulate the pte
|
|
|
|
* from the encoded file_pte if possible. This enables swappable
|
|
|
|
* nonlinear vmas.
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
*
|
|
|
|
* We enter with non-exclusive mmap_sem (to exclude vma changes,
|
|
|
|
* but allow concurrent faults), and pte mapped but not yet locked.
|
|
|
|
* We return with mmap_sem still held, but pte unmapped and unlocked.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-07-19 16:47:03 +08:00
|
|
|
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff;

	/* Mark the fault as nonlinear for __do_fault(). */
	flags |= FAULT_FLAG_NONLINEAR;

	/*
	 * A zero result means the pte no longer matches the value we were
	 * dispatched on: report "nothing done" and leave it to a later
	 * fault, if any.
	 */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		return 0;

	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
		/*
		 * Page table corrupted: show pte and kill process.
		 *
		 * A file pte in a vma that is not VM_NONLINEAR is not a
		 * memory shortage, so do not report it as VM_FAULT_OOM;
		 * VM_FAULT_SIGBUS is aimed at the faulting task itself.
		 */
		print_bad_pte(vma, address, orig_pte, NULL);
		return VM_FAULT_SIGBUS;
	}

	/* The file offset to fault in is encoded in the pte itself. */
	pgoff = pte_to_pgoff(orig_pte);
	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These routines also need to handle stuff like marking pages dirty
|
|
|
|
* and/or accessed for architectures that don't do it in hardware (most
|
|
|
|
* RISC architectures). The early dirtying is also good on the i386.
|
|
|
|
*
|
|
|
|
* There is also a hook called "update_mmu_cache()" that architectures
|
|
|
|
* with external mmu caches can use to update those (ie the Sparc or
|
|
|
|
* PowerPC hashed page tables that act as extended TLBs).
|
|
|
|
*
|
2005-10-30 09:16:23 +08:00
|
|
|
* We enter with non-exclusive mmap_sem (to exclude vma changes,
|
|
|
|
* but allow concurrent faults), and pte mapped but not yet locked.
|
|
|
|
* We return with mmap_sem still held, but pte unmapped and unlocked.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
static inline int handle_pte_fault(struct mm_struct *mm,
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
struct vm_area_struct *vma, unsigned long address,
|
2009-04-10 23:43:11 +08:00
|
|
|
pte_t *pte, pmd_t *pmd, unsigned int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
pte_t entry;
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
spinlock_t *ptl;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-06-17 01:16:12 +08:00
|
|
|
entry = *pte;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!pte_present(entry)) {
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
if (pte_none(entry)) {
|
2006-09-27 16:50:10 +08:00
|
|
|
if (vma->vm_ops) {
|
2008-04-28 17:12:10 +08:00
|
|
|
if (likely(vma->vm_ops->fault))
|
2007-07-19 16:46:59 +08:00
|
|
|
return do_linear_fault(mm, vma, address,
|
2009-04-10 23:43:11 +08:00
|
|
|
pte, pmd, flags, entry);
|
2006-09-27 16:50:10 +08:00
|
|
|
}
|
|
|
|
return do_anonymous_page(mm, vma, address,
|
2009-04-10 23:43:11 +08:00
|
|
|
pte, pmd, flags);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (pte_file(entry))
|
2007-07-19 16:47:03 +08:00
|
|
|
return do_nonlinear_fault(mm, vma, address,
|
2009-04-10 23:43:11 +08:00
|
|
|
pte, pmd, flags, entry);
|
[PATCH] mm: page fault handlers tidyup
Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?
break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.
do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that. BUG_ON if not? Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.
Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:15:59 +08:00
|
|
|
return do_swap_page(mm, vma, address,
|
2009-04-10 23:43:11 +08:00
|
|
|
pte, pmd, flags, entry);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
ptl = pte_lockptr(mm, pmd);
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
spin_lock(ptl);
|
|
|
|
if (unlikely(!pte_same(*pte, entry)))
|
|
|
|
goto unlock;
|
2009-04-10 23:43:11 +08:00
|
|
|
if (flags & FAULT_FLAG_WRITE) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!pte_write(entry))
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
return do_wp_page(mm, vma, address,
|
|
|
|
pte, pmd, ptl, entry);
|
2005-04-17 06:20:36 +08:00
|
|
|
entry = pte_mkdirty(entry);
|
|
|
|
}
|
|
|
|
entry = pte_mkyoung(entry);
|
2009-04-10 23:43:11 +08:00
|
|
|
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
|
2005-10-30 09:16:48 +08:00
|
|
|
update_mmu_cache(vma, address, entry);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This is needed only for protection faults but the arch code
|
|
|
|
* is not yet telling us if this is a protection fault or not.
|
|
|
|
* This still avoids useless tlb flushes for .text page faults
|
|
|
|
* with threads.
|
|
|
|
*/
|
2009-04-10 23:43:11 +08:00
|
|
|
if (flags & FAULT_FLAG_WRITE)
|
2005-10-30 09:16:48 +08:00
|
|
|
flush_tlb_page(vma, address);
|
|
|
|
}
|
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the
page_table_lock down to the head of handle_pte_fault (though it's also taken
and dropped earlier when a new page table has to be allocated).
Now delete that line, read "entry = *pte" without it, and go off to this or
that page fault handler on the basis of this unlocked peek. Usually the
handler can proceed without the lock, relying on the subsequent locked
pte_same or pte_none test to back out when necessary; though do_wp_page needs
the lock immediately, and do_file_page doesn't check (if there's a race,
install_page just zaps the entry and reinstalls it).
But on those architectures (notably i386 with PAE) whose pte is too big to be
read atomically, if SMP or preemption is enabled, do_swap_page and
do_file_page might cause irretrievable damage if passed a Frankenstein entry
stitched together from unrelated parts. In those configs, "pte_unmap_same"
has to take page_table_lock, validate orig_pte still the same, and drop
page_table_lock before unmapping, before proceeding.
Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock
avoidance leaves more lone maps and unmaps than elsewhere.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:26 +08:00
|
|
|
unlock:
|
|
|
|
pte_unmap_unlock(pte, ptl);
|
2007-07-19 16:47:05 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* By the time we get here, we already hold the mm semaphore
|
|
|
|
*/
|
2007-07-19 16:47:05 +08:00
|
|
|
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
2009-04-11 00:01:23 +08:00
|
|
|
unsigned long address, unsigned int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
|
2006-06-30 16:55:45 +08:00
|
|
|
count_vm_event(PGFAULT);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-20 23:24:28 +08:00
|
|
|
if (unlikely(is_vm_hugetlb_page(vma)))
|
2009-04-10 23:43:11 +08:00
|
|
|
return hugetlb_fault(mm, vma, address, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
pgd = pgd_offset(mm, address);
|
|
|
|
pud = pud_alloc(mm, pgd, address);
|
|
|
|
if (!pud)
|
2005-10-30 09:16:23 +08:00
|
|
|
return VM_FAULT_OOM;
|
2005-04-17 06:20:36 +08:00
|
|
|
pmd = pmd_alloc(mm, pud, address);
|
|
|
|
if (!pmd)
|
2005-10-30 09:16:23 +08:00
|
|
|
return VM_FAULT_OOM;
|
2005-04-17 06:20:36 +08:00
|
|
|
pte = pte_alloc_map(mm, pmd, address);
|
|
|
|
if (!pte)
|
2005-10-30 09:16:23 +08:00
|
|
|
return VM_FAULT_OOM;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-04-10 23:43:11 +08:00
|
|
|
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef __PAGETABLE_PUD_FOLDED
|
|
|
|
/*
|
|
|
|
* Allocate page upper directory.
|
[PATCH] mm: init_mm without ptlock
First step in pushing down the page_table_lock. init_mm.page_table_lock has
been used throughout the architectures (usually for ioremap): not to serialize
kernel address space allocation (that's usually vmlist_lock), but because
pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it.
Reverse that: don't lock or unlock init_mm.page_table_lock in any of the
architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take
and drop it when allocating a new one, to check lest a racing task already
did. Similarly no page_table_lock in vmalloc's map_vm_area.
Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle
user mms, which are converted only by a later patch, for now they have to lock
differently according to whether or not it's init_mm.
If sources get muddled, there's a danger that an arch source taking
init_mm.page_table_lock will be mixed with common source also taking it (or
neither take it). So break the rules and make another change, which should
break the build for such a mismatch: remove the redundant mm arg from
pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13).
Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64
used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to
pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64
map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free
took page_table_lock for no good reason.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:21 +08:00
|
|
|
* We've already handled the fast-path in-line.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2005-10-30 09:16:22 +08:00
|
|
|
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-10-30 09:16:23 +08:00
|
|
|
pud_t *new = pud_alloc_one(mm, address);
|
|
|
|
if (!new)
|
2005-10-30 09:16:22 +08:00
|
|
|
return -ENOMEM;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
fix SMP data race in pagetable setup vs walking
There is a possible data race in the page table walking code. After the split
ptlock patches, it actually seems to have been introduced to the core code, but
even before that I think it would have impacted some architectures (powerpc
and sparc64, at least, walk the page tables without taking locks eg. see
find_linux_pte()).
The race is as follows:
The pte page is allocated, zeroed, and its struct page gets its spinlock
initialized. The mm-wide ptl is then taken, and then the pte page is inserted
into the pagetables.
At this point, the spinlock is not guaranteed to have ordered the previous
stores to initialize the pte page with the subsequent store to put it in the
page tables. So another Linux page table walker might be walking down (without
any locks, because we have split-leaf-ptls), and find that new pte we've
inserted. It might try to take the spinlock before the store from the other
CPU initializes it. And subsequently it might read a pte_t out before stores
from the other CPU have cleared the memory.
There are also similar races in higher levels of the page tables. They
obviously don't involve the spinlock, but could see uninitialized memory.
Arch code and hardware pagetable walkers that walk the pagetables without
locks could see similar uninitialized memory problems, regardless of whether
split ptes are enabled or not.
I prefer to put the barriers in core code, because that's where the higher
level logic happens, but the page table accessors are per-arch, and open-coding
them everywhere I don't think is an option. I'll put the read-side barriers
in alpha arch code for now (other architectures perform data-dependent loads
in order).
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-05-14 12:37:36 +08:00
|
|
|
smp_wmb(); /* See comment in __pte_alloc */
|
|
|
|
|
[PATCH] mm: init_mm without ptlock
First step in pushing down the page_table_lock. init_mm.page_table_lock has
been used throughout the architectures (usually for ioremap): not to serialize
kernel address space allocation (that's usually vmlist_lock), but because
pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it.
Reverse that: don't lock or unlock init_mm.page_table_lock in any of the
architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take
and drop it when allocating a new one, to check lest a racing task already
did. Similarly no page_table_lock in vmalloc's map_vm_area.
Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle
user mms, which are converted only by a later patch, for now they have to lock
differently according to whether or not it's init_mm.
If sources get muddled, there's a danger that an arch source taking
init_mm.page_table_lock will be mixed with common source also taking it (or
neither take it). So break the rules and make another change, which should
break the build for such a mismatch: remove the redundant mm arg from
pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13).
Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64
used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to
pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64
map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free
took page_table_lock for no good reason.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:21 +08:00
|
|
|
spin_lock(&mm->page_table_lock);
|
2005-10-30 09:16:22 +08:00
|
|
|
if (pgd_present(*pgd)) /* Another has populated it */
|
2008-02-05 14:29:14 +08:00
|
|
|
pud_free(mm, new);
|
2005-10-30 09:16:22 +08:00
|
|
|
else
|
|
|
|
pgd_populate(mm, pgd, new);
|
2005-10-30 09:16:23 +08:00
|
|
|
spin_unlock(&mm->page_table_lock);
|
2005-10-30 09:16:22 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
#endif /* __PAGETABLE_PUD_FOLDED */
|
|
|
|
|
|
|
|
#ifndef __PAGETABLE_PMD_FOLDED
|
|
|
|
/*
|
|
|
|
* Allocate page middle directory.
|
[PATCH] mm: init_mm without ptlock
First step in pushing down the page_table_lock. init_mm.page_table_lock has
been used throughout the architectures (usually for ioremap): not to serialize
kernel address space allocation (that's usually vmlist_lock), but because
pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it.
Reverse that: don't lock or unlock init_mm.page_table_lock in any of the
architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take
and drop it when allocating a new one, to check lest a racing task already
did. Similarly no page_table_lock in vmalloc's map_vm_area.
Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle
user mms, which are converted only by a later patch, for now they have to lock
differently according to whether or not it's init_mm.
If sources get muddled, there's a danger that an arch source taking
init_mm.page_table_lock will be mixed with common source also taking it (or
neither take it). So break the rules and make another change, which should
break the build for such a mismatch: remove the redundant mm arg from
pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13).
Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64
used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to
pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64
map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free
took page_table_lock for no good reason.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:21 +08:00
|
|
|
* We've already handled the fast-path in-line.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2005-10-30 09:16:22 +08:00
|
|
|
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	/*
	 * Slow path of pmd allocation: get a new pmd table and hook it under
	 * @pud.  Returns 0 on success (also when somebody else populated the
	 * entry first and our table was freed again), -ENOMEM when
	 * pmd_alloc_one() fails.
	 */
	pmd_t *new_pmd;

	new_pmd = pmd_alloc_one(mm, address);
	if (new_pmd == NULL)
		return -ENOMEM;

	/*
	 * Order the stores that initialised the new table before the store
	 * that links it into the page tables: walkers that do not take
	 * page_table_lock could otherwise see uninitialised memory.
	 * See comment in __pte_alloc.
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud))
		pud_populate(mm, pud, new_pmd);
	else				/* Another has populated it */
		pmd_free(mm, new_pmd);
#else
	if (!pgd_present(*pud))
		pgd_populate(mm, pud, new_pmd);
	else				/* Another has populated it */
		pmd_free(mm, new_pmd);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);

	return 0;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* __PAGETABLE_PMD_FOLDED */
|
|
|
|
|
|
|
|
int make_pages_present(unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
int ret, len, write;
|
|
|
|
struct vm_area_struct * vma;
|
|
|
|
|
|
|
|
vma = find_vma(current->mm, addr);
|
|
|
|
if (!vma)
|
mlock() fix return values
Halesh says:
Please find the below testcase provide to test mlock.
Test Case :
===========================
#include <sys/resource.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
int main(void)
{
int fd,ret, i = 0;
char *addr, *addr1 = NULL;
unsigned int page_size;
struct rlimit rlim;
if (0 != geteuid())
{
printf("Execute this pgm as root\n");
exit(1);
}
/* create a file */
if ((fd = open("mmap_test.c",O_RDWR|O_CREAT,0755)) == -1)
{
printf("cant create test file\n");
exit(1);
}
page_size = sysconf(_SC_PAGE_SIZE);
/* set the MEMLOCK limit */
rlim.rlim_cur = 2000;
rlim.rlim_max = 2000;
if ((ret = setrlimit(RLIMIT_MEMLOCK,&rlim)) != 0)
{
printf("Cant change limit values\n");
exit(1);
}
addr = 0;
while (1)
{
/* map a page into memory each time*/
if ((addr = (char *) mmap(addr,page_size, PROT_READ |
PROT_WRITE,MAP_SHARED,fd,0)) == MAP_FAILED)
{
printf("cant do mmap on file\n");
exit(1);
}
if (0 == i)
addr1 = addr;
i++;
errno = 0;
/* lock the mapped memory pagewise*/
if ((ret = mlock((char *)addr, 1500)) == -1)
{
printf("errno value is %d\n", errno);
printf("cant lock maped region\n");
exit(1);
}
addr = addr + page_size;
}
}
======================================================
This testcase results in an mlock() failure with errno 14 that is EFAULT,
but it has nowhere been specified that mlock() will return EFAULT. When I
tested the same on older kernels like 2.6.18, I got the correct result i.e
errno 12 (ENOMEM).
I think in source code mlock(2), setting errno ENOMEM has been missed in
do_mlock() , on mlock_fixup() failure.
SUSv3 requires the following behavior frmo mlock(2).
[ENOMEM]
Some or all of the address range specified by the addr and
len arguments does not correspond to valid mapped pages
in the address space of the process.
[EAGAIN]
Some or all of the memory identified by the operation could not
be locked when the call was made.
This rule isn't so nice and slighly strange. but many people think
POSIX/SUS compliance is important.
Reported-by: Halesh Sadashiv <halesh.sadashiv@ap.sony.com>
Tested-by: Halesh Sadashiv <halesh.sadashiv@ap.sony.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: <stable@kernel.org> [2.6.25.x, 2.6.26.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-08-05 04:41:14 +08:00
|
|
|
return -ENOMEM;
|
2005-04-17 06:20:36 +08:00
|
|
|
write = (vma->vm_flags & VM_WRITE) != 0;
|
2006-03-27 00:30:52 +08:00
|
|
|
BUG_ON(addr >= end);
|
|
|
|
BUG_ON(end > vma->vm_end);
|
2007-07-16 14:38:03 +08:00
|
|
|
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
|
2005-04-17 06:20:36 +08:00
|
|
|
ret = get_user_pages(current, current->mm, addr,
|
|
|
|
len, write, 0, NULL, NULL);
|
2008-10-19 11:26:56 +08:00
|
|
|
if (ret < 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
return ret;
|
2008-10-19 11:26:56 +08:00
|
|
|
return ret == len ? 0 : -EFAULT;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(__HAVE_ARCH_GATE_AREA)
|
|
|
|
|
|
|
|
#ifdef AT_SYSINFO_EHDR
/* VMA spanning FIXADDR_USER_START..FIXADDR_USER_END, handed out by get_gate_vma(). */
static struct vm_area_struct gate_vma;

/*
 * Initcall that fills in gate_vma: no owning mm, the FIXADDR_USER_* range,
 * __P101 page protection and read/exec permissions.  Always returns 0.
 */
static int __init gate_vma_init(void)
{
	gate_vma.vm_mm = NULL;
	gate_vma.vm_start = FIXADDR_USER_START;
	gate_vma.vm_end = FIXADDR_USER_END;
	gate_vma.vm_page_prot = __P101;
	/*
	 * VM_ALWAYSDUMP makes sure the vDSO gets into every core dump.
	 * Dumping its contents makes post-mortem fully interpretable later
	 * without matching up the same kernel and hardware config to see
	 * what PC values meant.
	 */
	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC |
			    VM_ALWAYSDUMP;
	return 0;
}
__initcall(gate_vma_init);
#endif
|
|
|
|
|
|
|
|
/*
 * Return the gate VMA, or NULL when AT_SYSINFO_EHDR is not defined and
 * therefore no gate_vma exists.  @tsk is not used by this generic version.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
	struct vm_area_struct *gate = NULL;

#ifdef AT_SYSINFO_EHDR
	gate = &gate_vma;
#endif
	return gate;
}
|
|
|
|
|
|
|
|
/*
 * Return 1 when @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END),
 * 0 otherwise.  Without AT_SYSINFO_EHDR there is no gate area and the
 * answer is always 0.
 */
int in_gate_area_no_task(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
	return (addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END);
#else
	return 0;
#endif
}
|
|
|
|
|
|
|
|
#endif /* __HAVE_ARCH_GATE_AREA */
|
2006-09-27 16:50:15 +08:00
|
|
|
|
2009-06-17 06:32:33 +08:00
|
|
|
static int follow_pte(struct mm_struct *mm, unsigned long address,
|
|
|
|
pte_t **ptepp, spinlock_t **ptlp)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *ptep;
|
|
|
|
|
|
|
|
pgd = pgd_offset(mm, address);
|
|
|
|
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
pud = pud_offset(pgd, address);
|
|
|
|
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
pmd = pmd_offset(pud, address);
|
|
|
|
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* We cannot handle huge page PFN maps. Luckily they don't exist. */
|
|
|
|
if (pmd_huge(*pmd))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
|
|
|
|
if (!ptep)
|
|
|
|
goto out;
|
|
|
|
if (!pte_present(*ptep))
|
|
|
|
goto unlock;
|
|
|
|
*ptepp = ptep;
|
|
|
|
return 0;
|
|
|
|
unlock:
|
|
|
|
pte_unmap_unlock(ptep, *ptlp);
|
|
|
|
out:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2009-06-17 06:32:35 +08:00
|
|
|
/**
|
|
|
|
* follow_pfn - look up PFN at a user virtual address
|
|
|
|
* @vma: memory mapping
|
|
|
|
* @address: user virtual address
|
|
|
|
* @pfn: location to store found PFN
|
|
|
|
*
|
|
|
|
* Only IO mappings and raw PFN mappings are allowed.
|
|
|
|
*
|
|
|
|
* Returns zero and the pfn at @pfn on success, -ve otherwise.
|
|
|
|
*/
|
|
|
|
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
|
|
|
|
unsigned long *pfn)
|
|
|
|
{
|
|
|
|
int ret = -EINVAL;
|
|
|
|
spinlock_t *ptl;
|
|
|
|
pte_t *ptep;
|
|
|
|
|
|
|
|
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
*pfn = pte_pfn(*ptep);
|
|
|
|
pte_unmap_unlock(ptep, ptl);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(follow_pfn);
|
|
|
|
|
2008-07-24 12:27:05 +08:00
|
|
|
#ifdef CONFIG_HAVE_IOREMAP_PROT
|
2008-12-20 05:47:27 +08:00
|
|
|
/*
 * Translate @address in an IO / raw PFN mapping to a physical address.
 *
 * On success returns 0 and stores the pte's protection bits in *@prot and
 * the physical address of the page (pfn << PAGE_SHIFT) in *@phys.
 * Returns -EINVAL when @vma has neither VM_IO nor VM_PFNMAP, when
 * follow_pte() finds no present pte, or when FOLL_WRITE is set in @flags
 * but the pte is not writable.
 */
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int err = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t entry;

	if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) == 0)
		return err;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl) != 0)
		return err;

	entry = *ptep;

	/* Report the translation unless a write was asked of a read-only pte. */
	if (!(flags & FOLL_WRITE) || pte_write(entry)) {
		*prot = pgprot_val(pte_pgprot(entry));
		*phys = (resource_size_t)pte_pfn(entry) << PAGE_SHIFT;
		err = 0;
	}

	pte_unmap_unlock(ptep, ptl);
	return err;
}
|
|
|
|
|
|
|
|
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
void *buf, int len, int write)
|
|
|
|
{
|
|
|
|
resource_size_t phys_addr;
|
|
|
|
unsigned long prot = 0;
|
2009-01-07 06:39:43 +08:00
|
|
|
void __iomem *maddr;
|
2008-07-24 12:27:05 +08:00
|
|
|
int offset = addr & (PAGE_SIZE-1);
|
|
|
|
|
2008-12-20 05:47:27 +08:00
|
|
|
if (follow_phys(vma, addr, write, &prot, &phys_addr))
|
2008-07-24 12:27:05 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
|
|
|
|
if (write)
|
|
|
|
memcpy_toio(maddr + offset, buf, len);
|
|
|
|
else
|
|
|
|
memcpy_fromio(buf, maddr + offset, len);
|
|
|
|
iounmap(maddr);
|
|
|
|
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-09-27 16:50:15 +08:00
|
|
|
/*
|
|
|
|
* Access another process' address space.
|
|
|
|
* Source/target buffer must be kernel space,
|
|
|
|
* Do not walk the page table directly, use get_user_pages
|
|
|
|
*/
|
|
|
|
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm;
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
void *old_buf = buf;
|
|
|
|
|
|
|
|
mm = get_task_mm(tsk);
|
|
|
|
if (!mm)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
down_read(&mm->mmap_sem);
|
2007-10-20 07:27:18 +08:00
|
|
|
/* ignore errors, just check how much was successfully transferred */
|
2006-09-27 16:50:15 +08:00
|
|
|
while (len) {
|
|
|
|
int bytes, ret, offset;
|
|
|
|
void *maddr;
|
2008-07-24 12:27:05 +08:00
|
|
|
struct page *page = NULL;
|
2006-09-27 16:50:15 +08:00
|
|
|
|
|
|
|
ret = get_user_pages(tsk, mm, addr, 1,
|
|
|
|
write, 1, &page, &vma);
|
2008-07-24 12:27:05 +08:00
|
|
|
if (ret <= 0) {
|
|
|
|
/*
|
|
|
|
* Check if this is a VM_IO | VM_PFNMAP VMA, which
|
|
|
|
* we can access using slightly different code.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_HAVE_IOREMAP_PROT
|
|
|
|
vma = find_vma(mm, addr);
|
|
|
|
if (!vma)
|
|
|
|
break;
|
|
|
|
if (vma->vm_ops && vma->vm_ops->access)
|
|
|
|
ret = vma->vm_ops->access(vma, addr, buf,
|
|
|
|
len, write);
|
|
|
|
if (ret <= 0)
|
|
|
|
#endif
|
|
|
|
break;
|
|
|
|
bytes = ret;
|
2006-09-27 16:50:15 +08:00
|
|
|
} else {
|
2008-07-24 12:27:05 +08:00
|
|
|
bytes = len;
|
|
|
|
offset = addr & (PAGE_SIZE-1);
|
|
|
|
if (bytes > PAGE_SIZE-offset)
|
|
|
|
bytes = PAGE_SIZE-offset;
|
|
|
|
|
|
|
|
maddr = kmap(page);
|
|
|
|
if (write) {
|
|
|
|
copy_to_user_page(vma, page, addr,
|
|
|
|
maddr + offset, buf, bytes);
|
|
|
|
set_page_dirty_lock(page);
|
|
|
|
} else {
|
|
|
|
copy_from_user_page(vma, page, addr,
|
|
|
|
buf, maddr + offset, bytes);
|
|
|
|
}
|
|
|
|
kunmap(page);
|
|
|
|
page_cache_release(page);
|
2006-09-27 16:50:15 +08:00
|
|
|
}
|
|
|
|
len -= bytes;
|
|
|
|
buf += bytes;
|
|
|
|
addr += bytes;
|
|
|
|
}
|
|
|
|
up_read(&mm->mmap_sem);
|
|
|
|
mmput(mm);
|
|
|
|
|
|
|
|
return buf - old_buf;
|
|
|
|
}
|
2008-01-30 20:33:18 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Print the name of a VMA.
|
|
|
|
*/
|
|
|
|
void print_vma_addr(char *prefix, unsigned long ip)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
|
2008-02-14 03:21:06 +08:00
|
|
|
/*
|
|
|
|
* Do not print if we are in atomic
|
|
|
|
* contexts (in exception stacks, etc.):
|
|
|
|
*/
|
|
|
|
if (preempt_count())
|
|
|
|
return;
|
|
|
|
|
2008-01-30 20:33:18 +08:00
|
|
|
down_read(&mm->mmap_sem);
|
|
|
|
vma = find_vma(mm, ip);
|
|
|
|
if (vma && vma->vm_file) {
|
|
|
|
struct file *f = vma->vm_file;
|
|
|
|
char *buf = (char *)__get_free_page(GFP_KERNEL);
|
|
|
|
if (buf) {
|
|
|
|
char *p, *s;
|
|
|
|
|
2008-02-15 11:38:44 +08:00
|
|
|
p = d_path(&f->f_path, buf, PAGE_SIZE);
|
2008-01-30 20:33:18 +08:00
|
|
|
if (IS_ERR(p))
|
|
|
|
p = "?";
|
|
|
|
s = strrchr(p, '/');
|
|
|
|
if (s)
|
|
|
|
p = s+1;
|
|
|
|
printk("%s%s[%lx+%lx]", prefix, p,
|
|
|
|
vma->vm_start,
|
|
|
|
vma->vm_end - vma->vm_start);
|
|
|
|
free_page((unsigned long)buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
up_read(¤t->mm->mmap_sem);
|
|
|
|
}
|
2008-09-10 19:37:17 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROVE_LOCKING
|
|
|
|
void might_fault(void)
|
|
|
|
{
|
2009-01-12 20:02:11 +08:00
|
|
|
/*
|
|
|
|
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
|
|
|
|
* holding the mmap_sem, this is safe because kernel memory doesn't
|
|
|
|
* get paged out, therefore we'll never actually fault, and the
|
|
|
|
* below annotations will generate false positives.
|
|
|
|
*/
|
|
|
|
if (segment_eq(get_fs(), KERNEL_DS))
|
|
|
|
return;
|
|
|
|
|
2008-09-10 19:37:17 +08:00
|
|
|
might_sleep();
|
|
|
|
/*
|
|
|
|
* it would be nicer only to annotate paths which are not under
|
|
|
|
* pagefault_disable, however that requires a larger audit and
|
|
|
|
* providing helpers like get_user_atomic.
|
|
|
|
*/
|
|
|
|
if (!in_atomic() && current->mm)
|
|
|
|
might_lock_read(¤t->mm->mmap_sem);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(might_fault);
|
|
|
|
#endif
|