2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* fs/dcache.c
|
|
|
|
*
|
|
|
|
* Complete reimplementation
|
|
|
|
* (C) 1997 Thomas Schoebel-Theuer,
|
|
|
|
* with heavy changes by Linus Torvalds
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Notes on the allocation strategy:
|
|
|
|
*
|
|
|
|
* The dcache is a master of the icache - whenever a dcache entry
|
|
|
|
* exists, the inode will always exist. "iput()" is done either when
|
|
|
|
* the dcache entry is deleted or garbage collected.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/fs.h>
|
2005-08-09 01:52:16 +08:00
|
|
|
#include <linux/fsnotify.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/hash.h>
|
|
|
|
#include <linux/cache.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include <linux/security.h>
|
|
|
|
#include <linux/seqlock.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/bootmem.h>
|
2009-03-30 07:50:06 +08:00
|
|
|
#include <linux/fs_struct.h>
|
2009-07-16 21:44:29 +08:00
|
|
|
#include <linux/hardirq.h>
|
2011-01-07 14:50:05 +08:00
|
|
|
#include <linux/bit_spinlock.h>
|
|
|
|
#include <linux/rculist_bl.h>
|
2011-05-21 03:50:29 +08:00
|
|
|
#include <linux/prefetch.h>
|
2011-08-16 22:31:30 +08:00
|
|
|
#include <linux/ratelimit.h>
|
2006-10-01 02:52:18 +08:00
|
|
|
#include "internal.h"
|
2011-11-24 08:26:23 +08:00
|
|
|
#include "mount.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:30 +08:00
|
|
|
/*
|
|
|
|
* Usage:
|
2011-01-07 14:50:06 +08:00
|
|
|
* dcache->d_inode->i_lock protects:
|
|
|
|
* - i_dentry, d_alias, d_inode of aliases
|
2011-01-07 14:50:05 +08:00
|
|
|
* dcache_hash_bucket lock protects:
|
|
|
|
* - the dcache hash table
|
|
|
|
* s_anon bl list spinlock protects:
|
|
|
|
* - the s_anon list (see __d_drop)
|
2011-01-07 14:49:31 +08:00
|
|
|
* dcache_lru_lock protects:
|
|
|
|
* - the dcache lru lists and counters
|
|
|
|
* d_lock protects:
|
|
|
|
* - d_flags
|
|
|
|
* - d_name
|
|
|
|
* - d_lru
|
2011-01-07 14:49:32 +08:00
|
|
|
* - d_count
|
2011-01-07 14:49:33 +08:00
|
|
|
* - d_unhashed()
|
2011-01-07 14:49:34 +08:00
|
|
|
* - d_parent and d_subdirs
|
|
|
|
* - children's d_child and d_parent
|
2011-01-07 14:49:35 +08:00
|
|
|
* - d_alias, d_inode
|
2011-01-07 14:49:30 +08:00
|
|
|
*
|
|
|
|
* Ordering:
|
2011-01-07 14:50:06 +08:00
|
|
|
* dentry->d_inode->i_lock
|
2011-01-07 14:49:38 +08:00
|
|
|
* dentry->d_lock
|
|
|
|
* dcache_lru_lock
|
2011-01-07 14:50:05 +08:00
|
|
|
* dcache_hash_bucket lock
|
|
|
|
* s_anon lock
|
2011-01-07 14:49:30 +08:00
|
|
|
*
|
2011-01-07 14:49:33 +08:00
|
|
|
* If there is an ancestor relationship:
|
|
|
|
* dentry->d_parent->...->d_parent->d_lock
|
|
|
|
* ...
|
|
|
|
* dentry->d_parent->d_lock
|
|
|
|
* dentry->d_lock
|
|
|
|
*
|
|
|
|
* If no ancestor relationship:
|
2011-01-07 14:49:30 +08:00
|
|
|
* if (dentry1 < dentry2)
|
|
|
|
* dentry1->d_lock
|
|
|
|
* dentry2->d_lock
|
|
|
|
*/
|
2006-03-26 17:37:24 +08:00
|
|
|
int sysctl_vfs_cache_pressure __read_mostly = 100;
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
|
|
|
|
|
2011-01-07 14:49:31 +08:00
|
|
|
/* Protects the dcache LRU lists and counters (see the locking notes above). */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
|
[PATCH] audit: watching subtrees
New kind of audit rule predicates: "object is visible in given subtree".
The part that can be sanely implemented, that is. Limitations:
* if you have hardlink from outside of tree, you'd better watch
it too (or just watch the object itself, obviously)
* if you mount something under a watched tree, tell audit
that new chunk should be added to watched subtrees
* if you umount something in a watched tree and it's still mounted
elsewhere, you will get matches on events happening there. New command
tells audit to recalculate the trees, trimming such sources of false
positives.
Note that it's _not_ about path - if something mounted in several places
(multiple mount, bindings, different namespaces, etc.), the match does
_not_ depend on which one we are using for access.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2007-07-22 20:04:18 +08:00
|
|
|
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:37 +08:00
|
|
|
EXPORT_SYMBOL(rename_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-12-07 12:33:20 +08:00
|
|
|
/* Slab cache that dentries are returned to in __d_free(). */
static struct kmem_cache *dentry_cache __read_mostly;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the single most critical data structure when it comes
|
|
|
|
* to the dcache: the hashtable for lookups. Somebody should try
|
|
|
|
* to make this good - I've just made it work.
|
|
|
|
*
|
|
|
|
* This hash-function tries to avoid losing too many bits of hash
|
|
|
|
* information, yet avoid using a prime hash-size or similar.
|
|
|
|
*/
|
|
|
|
#define D_HASHBITS d_hash_shift
|
|
|
|
#define D_HASHMASK d_hash_mask
|
|
|
|
|
2006-03-26 17:37:24 +08:00
|
|
|
/* Mask applied (via D_HASHMASK) to the mixed hash in d_hash() to pick a bucket. */
static unsigned int d_hash_mask __read_mostly;
|
|
|
|
/* Right-shift count used (via D_HASHBITS) by d_hash() when mixing the hash. */
static unsigned int d_hash_shift __read_mostly;
|
2011-01-07 14:50:05 +08:00
|
|
|
|
2011-04-24 13:32:03 +08:00
|
|
|
/* Array of hash chain heads; d_hash() indexes it with (hash & D_HASHMASK). */
static struct hlist_bl_head *dentry_hashtable __read_mostly;
|
2011-01-07 14:50:05 +08:00
|
|
|
|
2011-04-24 13:32:03 +08:00
|
|
|
static inline struct hlist_bl_head *d_hash(struct dentry *parent,
|
2011-01-07 14:50:05 +08:00
|
|
|
unsigned long hash)
|
|
|
|
{
|
|
|
|
hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
|
|
|
|
hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
|
|
|
|
return dentry_hashtable + (hash & D_HASHMASK);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Statistics gathering. */
|
|
|
|
struct dentry_stat_t dentry_stat = {
|
|
|
|
.age_limit = 45,
|
|
|
|
};
|
|
|
|
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:19 +08:00
|
|
|
/* Per-CPU dentry count: decremented in d_free(), summed by get_nr_dentry(). */
static DEFINE_PER_CPU(unsigned int, nr_dentry);
|
2010-10-10 17:36:23 +08:00
|
|
|
|
|
|
|
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:19 +08:00
|
|
|
static int get_nr_dentry(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int sum = 0;
|
|
|
|
for_each_possible_cpu(i)
|
|
|
|
sum += per_cpu(nr_dentry, i);
|
|
|
|
return sum < 0 ? 0 : sum;
|
|
|
|
}
|
|
|
|
|
2010-10-10 17:36:23 +08:00
|
|
|
/*
 * sysctl handler: refresh dentry_stat.nr_dentry from the per-CPU counters,
 * then hand the request to proc_dointvec() and return its result.
 */
int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
		   size_t *lenp, loff_t *ppos)
{
	int ret;

	dentry_stat.nr_dentry = get_nr_dentry();
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	return ret;
}
|
|
|
|
#endif
|
|
|
|
|
2010-10-10 17:36:22 +08:00
|
|
|
static void __d_free(struct rcu_head *head)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-10-10 17:36:22 +08:00
|
|
|
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
|
|
|
|
|
2008-10-21 21:47:33 +08:00
|
|
|
WARN_ON(!list_empty(&dentry->d_alias));
|
2005-04-17 06:20:36 +08:00
|
|
|
if (dname_external(dentry))
|
|
|
|
kfree(dentry->d_name.name);
|
|
|
|
kmem_cache_free(dentry_cache, dentry);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-01-07 14:49:38 +08:00
|
|
|
* no locks, please.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
static void d_free(struct dentry *dentry)
|
|
|
|
{
|
2011-01-07 14:49:32 +08:00
|
|
|
BUG_ON(dentry->d_count);
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:19 +08:00
|
|
|
this_cpu_dec(nr_dentry);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (dentry->d_op && dentry->d_op->d_release)
|
|
|
|
dentry->d_op->d_release(dentry);
|
2010-10-10 17:36:23 +08:00
|
|
|
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCEES bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddently that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 22:58:46 +08:00
|
|
|
/* if dentry was never visible to RCU, immediate free is OK */
|
|
|
|
if (!(dentry->d_flags & DCACHE_RCUACCESS))
|
2010-10-10 17:36:22 +08:00
|
|
|
__d_free(&dentry->d_u.d_rcu);
|
2006-12-07 12:38:48 +08:00
|
|
|
else
|
2010-10-10 17:36:22 +08:00
|
|
|
call_rcu(&dentry->d_u.d_rcu, __d_free);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
/**
|
|
|
|
* dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
|
2011-01-23 12:16:06 +08:00
|
|
|
* @dentry: the target dentry
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
* After this call, in-progress rcu-walk path lookup will fail. This
|
|
|
|
* should be called after unhashing, and after changing d_inode (if
|
|
|
|
* the dentry has not already been unhashed).
|
|
|
|
*/
|
|
|
|
static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
assert_spin_locked(&dentry->d_lock);
|
|
|
|
/* Go through a barrier */
|
|
|
|
write_seqcount_barrier(&dentry->d_seq);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Release the dentry's inode, using the filesystem
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
* d_iput() operation if defined. Dentry has no refcount
|
|
|
|
* and is unhashed.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-01-15 05:20:43 +08:00
|
|
|
static void dentry_iput(struct dentry * dentry)
|
2008-06-24 00:11:52 +08:00
|
|
|
__releases(dentry->d_lock)
|
2011-01-07 14:50:06 +08:00
|
|
|
__releases(dentry->d_inode->i_lock)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
if (inode) {
|
|
|
|
dentry->d_inode = NULL;
|
|
|
|
list_del_init(&dentry->d_alias);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-09-20 10:54:29 +08:00
|
|
|
if (!inode->i_nlink)
|
|
|
|
fsnotify_inoderemove(inode);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (dentry->d_op && dentry->d_op->d_iput)
|
|
|
|
dentry->d_op->d_iput(dentry, inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
|
|
|
} else {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
/*
|
|
|
|
* Release the dentry's inode, using the filesystem
|
|
|
|
* d_iput() operation if defined. dentry remains in-use.
|
|
|
|
*/
|
|
|
|
static void dentry_unlink_inode(struct dentry * dentry)
|
|
|
|
__releases(dentry->d_lock)
|
2011-01-07 14:50:06 +08:00
|
|
|
__releases(dentry->d_inode->i_lock)
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
dentry->d_inode = NULL;
|
|
|
|
list_del_init(&dentry->d_alias);
|
|
|
|
dentry_rcuwalk_barrier(dentry);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
if (!inode->i_nlink)
|
|
|
|
fsnotify_inoderemove(inode);
|
|
|
|
if (dentry->d_op && dentry->d_op->d_iput)
|
|
|
|
dentry->d_op->d_iput(dentry, inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
|
|
|
}
|
|
|
|
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is different from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentries can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this splitting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentries with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scanning superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
/*
|
2011-10-29 01:02:42 +08:00
|
|
|
* dentry_lru_(add|del|prune|move_list) must be called with d_lock held.
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
*/
|
|
|
|
static void dentry_lru_add(struct dentry *dentry)
|
|
|
|
{
|
2010-10-10 17:36:26 +08:00
|
|
|
if (list_empty(&dentry->d_lru)) {
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_lock(&dcache_lru_lock);
|
2010-10-10 17:36:26 +08:00
|
|
|
list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
|
|
|
|
dentry->d_sb->s_nr_dentry_unused++;
|
2011-01-07 14:49:18 +08:00
|
|
|
dentry_stat.nr_unused++;
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_unlock(&dcache_lru_lock);
|
2010-10-10 17:36:26 +08:00
|
|
|
}
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:31 +08:00
|
|
|
static void __dentry_lru_del(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
list_del_init(&dentry->d_lru);
|
fix shrink_dcache_parent() livelock
Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may
cause shrink_dcache_parent() to loop forever.
Here's what appears to happen:
1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1
2 - CPU1: select_parent(P) locks P->d_lock
3 - CPU0: shrink_dentry_list() locks C->d_lock
dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock
4 - CPU1: select_parent(P) locks C->d_lock,
moves C from dispose list being processed on CPU0 to the new
dispose list, returns 1
5 - CPU0: shrink_dentry_list() finds dispose list empty, returns
6 - Goto 2 with CPU0 and CPU1 switched
Basically select_parent() steals the dentry from shrink_dentry_list() and thinks
it found a new one, causing shrink_dentry_list() to think it's making progress
and loop over and over.
One way to trigger this is to make udev calls stat() on the sysfs file while it
is going away.
Having a file in /lib/udev/rules.d/ with only this one rule seems to the trick:
ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"
Then execute the following loop:
while true; do
echo -bond0 > /sys/class/net/bonding_masters
echo +bond0 > /sys/class/net/bonding_masters
echo -bond1 > /sys/class/net/bonding_masters
echo +bond1 > /sys/class/net/bonding_masters
done
One fix would be to check all callers and prevent concurrent calls to
shrink_dcache_parent(). But I think a better solution is to stop the
stealing behavior.
This patch adds a new dentry flag that is set when the dentry is added to the
dispose list. The flag is cleared in dentry_lru_del() in case the dentry gets a
new reference just before being pruned.
If the dentry has this flag, select_parent() will skip it and let
shrink_dentry_list() retry pruning it. With select_parent() skipping those
dentries there will not be the appearance of progress (new dentries found) when
there is none, hence shrink_dcache_parent() will not loop forever.
Set the flag is also set in prune_dcache_sb() for consistency as suggested by
Linus.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2012-01-11 01:22:25 +08:00
|
|
|
dentry->d_flags &= ~DCACHE_SHRINK_LIST;
|
2011-01-07 14:49:31 +08:00
|
|
|
dentry->d_sb->s_nr_dentry_unused--;
|
|
|
|
dentry_stat.nr_unused--;
|
|
|
|
}
|
|
|
|
|
2011-10-29 01:02:42 +08:00
|
|
|
/*
|
|
|
|
* Remove a dentry with references from the LRU.
|
|
|
|
*/
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
static void dentry_lru_del(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
if (!list_empty(&dentry->d_lru)) {
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_lock(&dcache_lru_lock);
|
|
|
|
__dentry_lru_del(dentry);
|
|
|
|
spin_unlock(&dcache_lru_lock);
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-29 01:02:42 +08:00
|
|
|
/*
|
|
|
|
* Remove a dentry that is unreferenced and about to be pruned
|
|
|
|
* (unhashed and destroyed) from the LRU, and inform the file system.
|
|
|
|
* This wrapper should be called _prior_ to unhashing a victim dentry.
|
|
|
|
*/
|
|
|
|
static void dentry_lru_prune(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
if (!list_empty(&dentry->d_lru)) {
|
|
|
|
if (dentry->d_flags & DCACHE_OP_PRUNE)
|
|
|
|
dentry->d_op->d_prune(dentry);
|
|
|
|
|
|
|
|
spin_lock(&dcache_lru_lock);
|
|
|
|
__dentry_lru_del(dentry);
|
|
|
|
spin_unlock(&dcache_lru_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-08-23 16:56:24 +08:00
|
|
|
static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all of dentry_unused linearly
under spin_lock(), and if dentry->d_sb is different from the given
superblock, it moves on to the next dentry. This scan is very costly if there
are many entries, and very inefficient if there are many superblocks.
IOW, when we need to shrink unused dentries on one superblock, we scan
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scan 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
A per-superblock LRU of unused dentries can reduce the cost in a
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split the LRU list of unused dentries into one per superblock. Then, NFS
mounting will check only the dentries under one superblock instead of all. But
this splitting will break the LRU ordering of dentry_unused. So, I've attempted
to reclaim unused dentries fairly by calculating the number of
dentries to scan on this sb in the following way:
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measure performance numbers and do stress tests.
- When an unmount occurs during prune_dcache() while it is scanning the
same superblock, it is unable to reach the next superblock because it has
gone away. We restart scanning superblocks from the first one, which causes
unfairness in reclaiming unused dentries on the first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
{
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_lock(&dcache_lru_lock);
|
2010-10-10 17:36:26 +08:00
|
|
|
if (list_empty(&dentry->d_lru)) {
|
2011-08-23 16:56:24 +08:00
|
|
|
list_add_tail(&dentry->d_lru, list);
|
2010-10-10 17:36:26 +08:00
|
|
|
dentry->d_sb->s_nr_dentry_unused++;
|
2011-01-07 14:49:18 +08:00
|
|
|
dentry_stat.nr_unused++;
|
2010-10-10 17:36:26 +08:00
|
|
|
} else {
|
2011-08-23 16:56:24 +08:00
|
|
|
list_move_tail(&dentry->d_lru, list);
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all of dentry_unused linearly
under spin_lock(), and if dentry->d_sb is different from the given
superblock, it moves on to the next dentry. This scan is very costly if there
are many entries, and very inefficient if there are many superblocks.
IOW, when we need to shrink unused dentries on one superblock, we scan
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scan 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
A per-superblock LRU of unused dentries can reduce the cost in a
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split the LRU list of unused dentries into one per superblock. Then, NFS
mounting will check only the dentries under one superblock instead of all. But
this splitting will break the LRU ordering of dentry_unused. So, I've attempted
to reclaim unused dentries fairly by calculating the number of
dentries to scan on this sb in the following way:
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measure performance numbers and do stress tests.
- When an unmount occurs during prune_dcache() while it is scanning the
same superblock, it is unable to reach the next superblock because it has
gone away. We restart scanning superblocks from the first one, which causes
unfairness in reclaiming unused dentries on the first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
}
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_unlock(&dcache_lru_lock);
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all of dentry_unused linearly
under spin_lock(), and if dentry->d_sb is different from the given
superblock, it moves on to the next dentry. This scan is very costly if there
are many entries, and very inefficient if there are many superblocks.
IOW, when we need to shrink unused dentries on one superblock, we scan
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scan 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
A per-superblock LRU of unused dentries can reduce the cost in a
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split the LRU list of unused dentries into one per superblock. Then, NFS
mounting will check only the dentries under one superblock instead of all. But
this splitting will break the LRU ordering of dentry_unused. So, I've attempted
to reclaim unused dentries fairly by calculating the number of
dentries to scan on this sb in the following way:
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measure performance numbers and do stress tests.
- When an unmount occurs during prune_dcache() while it is scanning the
same superblock, it is unable to reach the next superblock because it has
gone away. We restart scanning superblocks from the first one, which causes
unfairness in reclaiming unused dentries on the first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
}
|
|
|
|
|
2007-05-08 15:23:46 +08:00
|
|
|
/**
|
|
|
|
* d_kill - kill dentry and return parent
|
|
|
|
* @dentry: dentry to kill
|
2011-01-23 12:16:06 +08:00
|
|
|
* @parent: parent dentry
|
2007-05-08 15:23:46 +08:00
|
|
|
*
|
2008-06-24 00:11:52 +08:00
|
|
|
* The dentry must already be unhashed and removed from the LRU.
|
2007-05-08 15:23:46 +08:00
|
|
|
*
|
|
|
|
* If this is the root of the dentry tree, return NULL.
|
2011-01-07 14:49:31 +08:00
|
|
|
*
|
2011-01-07 14:49:38 +08:00
|
|
|
* dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
|
|
|
|
* d_kill.
|
2007-05-08 15:23:46 +08:00
|
|
|
*/
|
2011-01-07 14:49:34 +08:00
|
|
|
static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
|
2008-06-24 00:11:52 +08:00
|
|
|
__releases(dentry->d_lock)
|
2011-01-07 14:49:34 +08:00
|
|
|
__releases(parent->d_lock)
|
2011-01-07 14:50:06 +08:00
|
|
|
__releases(dentry->d_inode->i_lock)
|
2007-05-08 15:23:46 +08:00
|
|
|
{
|
|
|
|
list_del(&dentry->d_u.d_child);
|
2011-03-16 01:36:43 +08:00
|
|
|
/*
|
|
|
|
* Inform try_to_ascend() that we are no longer attached to the
|
|
|
|
* dentry tree
|
|
|
|
*/
|
|
|
|
dentry->d_flags |= DCACHE_DISCONNECTED;
|
2011-01-07 14:49:34 +08:00
|
|
|
if (parent)
|
|
|
|
spin_unlock(&parent->d_lock);
|
2007-05-08 15:23:46 +08:00
|
|
|
dentry_iput(dentry);
|
2011-01-07 14:49:32 +08:00
|
|
|
/*
|
|
|
|
* dentry_iput drops the locks, at which point nobody (except
|
|
|
|
* transient RCU lookups) can reach this dentry.
|
|
|
|
*/
|
2007-05-08 15:23:46 +08:00
|
|
|
d_free(dentry);
|
2008-10-16 06:50:27 +08:00
|
|
|
return parent;
|
2007-05-08 15:23:46 +08:00
|
|
|
}
|
|
|
|
|
2011-06-07 21:09:20 +08:00
|
|
|
/*
|
|
|
|
* Unhash a dentry without inserting an RCU walk barrier or checking that
|
|
|
|
* dentry->d_lock is locked. The caller must take care of that, if
|
|
|
|
* appropriate.
|
|
|
|
*/
|
|
|
|
static void __d_shrink(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
if (!d_unhashed(dentry)) {
|
|
|
|
struct hlist_bl_head *b;
|
|
|
|
if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
|
|
|
|
b = &dentry->d_sb->s_anon;
|
|
|
|
else
|
|
|
|
b = d_hash(dentry->d_parent, dentry->d_name.hash);
|
|
|
|
|
|
|
|
hlist_bl_lock(b);
|
|
|
|
__hlist_bl_del(&dentry->d_hash);
|
|
|
|
dentry->d_hash.pprev = NULL;
|
|
|
|
hlist_bl_unlock(b);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:30 +08:00
|
|
|
/**
|
|
|
|
* d_drop - drop a dentry
|
|
|
|
* @dentry: dentry to drop
|
|
|
|
*
|
|
|
|
* d_drop() unhashes the entry from the parent dentry hashes, so that it won't
|
|
|
|
* be found through a VFS lookup any more. Note that this is different from
|
|
|
|
* deleting the dentry - d_delete will try to mark the dentry negative if
|
|
|
|
* possible, giving a successful _negative_ lookup, while d_drop will
|
|
|
|
* just make the cache lookup fail.
|
|
|
|
*
|
|
|
|
* d_drop() is used mainly for stuff that wants to invalidate a dentry for some
|
|
|
|
* reason (NFS timeouts or autofs deletes).
|
|
|
|
*
|
|
|
|
* __d_drop requires dentry->d_lock.
|
|
|
|
*/
|
|
|
|
void __d_drop(struct dentry *dentry)
|
|
|
|
{
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCESS bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddenly that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 22:58:46 +08:00
|
|
|
if (!d_unhashed(dentry)) {
|
2011-06-07 21:09:20 +08:00
|
|
|
__d_shrink(dentry);
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCESS bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddenly that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 22:58:46 +08:00
|
|
|
dentry_rcuwalk_barrier(dentry);
|
2011-01-07 14:49:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__d_drop);
|
|
|
|
|
|
|
|
void d_drop(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
__d_drop(dentry);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_drop);
|
|
|
|
|
2011-05-31 23:58:49 +08:00
|
|
|
/*
|
|
|
|
* d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
|
|
|
|
* @dentry: dentry to drop
|
|
|
|
*
|
|
|
|
* This is called when we do a lookup on a placeholder dentry that needed to be
|
|
|
|
* looked up. The dentry should have been hashed in order for it to be found by
|
|
|
|
* the lookup code, but now needs to be unhashed while we do the actual lookup
|
|
|
|
* and clear the DCACHE_NEED_LOOKUP flag.
|
|
|
|
*/
|
|
|
|
void d_clear_need_lookup(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
__d_drop(dentry);
|
|
|
|
dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_clear_need_lookup);
|
|
|
|
|
2011-01-07 14:49:48 +08:00
|
|
|
/*
|
|
|
|
* Finish off a dentry we've decided to kill.
|
|
|
|
* dentry->d_lock must be held, returns with it unlocked.
|
|
|
|
* If ref is non-zero, then decrement the refcount too.
|
|
|
|
* Returns dentry requiring refcount drop, or NULL if we're done.
|
|
|
|
*/
|
|
|
|
static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
|
|
|
|
__releases(dentry->d_lock)
|
|
|
|
{
|
2011-01-07 14:50:06 +08:00
|
|
|
struct inode *inode;
|
2011-01-07 14:49:48 +08:00
|
|
|
struct dentry *parent;
|
|
|
|
|
2011-01-07 14:50:06 +08:00
|
|
|
inode = dentry->d_inode;
|
|
|
|
if (inode && !spin_trylock(&inode->i_lock)) {
|
2011-01-07 14:49:48 +08:00
|
|
|
relock:
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
cpu_relax();
|
|
|
|
return dentry; /* try again with same dentry */
|
|
|
|
}
|
|
|
|
if (IS_ROOT(dentry))
|
|
|
|
parent = NULL;
|
|
|
|
else
|
|
|
|
parent = dentry->d_parent;
|
|
|
|
if (parent && !spin_trylock(&parent->d_lock)) {
|
2011-01-07 14:50:06 +08:00
|
|
|
if (inode)
|
|
|
|
spin_unlock(&inode->i_lock);
|
2011-01-07 14:49:48 +08:00
|
|
|
goto relock;
|
|
|
|
}
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we cannot drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (i.e. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
|
2011-01-07 14:49:48 +08:00
|
|
|
if (ref)
|
|
|
|
dentry->d_count--;
|
2011-10-29 01:02:42 +08:00
|
|
|
/*
|
|
|
|
* if dentry was on the d_lru list delete it from there.
|
|
|
|
* inform the fs via d_prune that this dentry is about to be
|
|
|
|
* unhashed and destroyed.
|
|
|
|
*/
|
|
|
|
dentry_lru_prune(dentry);
|
2011-01-07 14:49:48 +08:00
|
|
|
/* if it was on the hash then remove it */
|
|
|
|
__d_drop(dentry);
|
|
|
|
return d_kill(dentry, parent);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* This is dput
|
|
|
|
*
|
|
|
|
* This is complicated by the fact that we do not want to put
|
|
|
|
* dentries that are no longer on any hash chain on the unused
|
|
|
|
* list: we'd much rather just get rid of them immediately.
|
|
|
|
*
|
|
|
|
* However, that implies that we have to traverse the dentry
|
|
|
|
* tree upwards to the parents which might _also_ now be
|
|
|
|
* scheduled for deletion (it may have been only waiting for
|
|
|
|
* its last child to go away).
|
|
|
|
*
|
|
|
|
* This tail recursion is done by hand as we don't want to depend
|
|
|
|
* on the compiler to always get this right (gcc generally doesn't).
|
|
|
|
* Real recursion would eat up our stack space.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dput - release a dentry
|
|
|
|
* @dentry: dentry to release
|
|
|
|
*
|
|
|
|
* Release a dentry. This will drop the usage count and if appropriate
|
|
|
|
* call the dentry unlink method as well as removing it from the queues and
|
|
|
|
* releasing its resources. If the parent dentries were scheduled for release
|
|
|
|
* they too may now get deleted.
|
|
|
|
*/
|
|
|
|
void dput(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
if (!dentry)
|
|
|
|
return;
|
|
|
|
|
|
|
|
repeat:
|
2011-01-07 14:49:32 +08:00
|
|
|
if (dentry->d_count == 1)
|
2005-04-17 06:20:36 +08:00
|
|
|
might_sleep();
|
|
|
|
spin_lock(&dentry->d_lock);
|
2011-01-07 14:49:40 +08:00
|
|
|
BUG_ON(!dentry->d_count);
|
|
|
|
if (dentry->d_count > 1) {
|
|
|
|
dentry->d_count--;
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:55 +08:00
|
|
|
if (dentry->d_flags & DCACHE_OP_DELETE) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (dentry->d_op->d_delete(dentry))
|
2011-01-07 14:49:40 +08:00
|
|
|
goto kill_it;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-10-10 17:36:24 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Unreachable? Get rid of it */
|
|
|
|
if (d_unhashed(dentry))
|
|
|
|
goto kill_it;
|
2010-10-10 17:36:24 +08:00
|
|
|
|
2011-05-31 23:58:49 +08:00
|
|
|
/*
|
|
|
|
* If this dentry needs lookup, don't set the referenced flag so that it
|
|
|
|
* is more likely to be cleaned up by the dcache shrinker in case of
|
|
|
|
* memory pressure.
|
|
|
|
*/
|
|
|
|
if (!d_need_lookup(dentry))
|
|
|
|
dentry->d_flags |= DCACHE_REFERENCED;
|
2010-10-10 17:36:26 +08:00
|
|
|
dentry_lru_add(dentry);
|
2010-10-10 17:36:24 +08:00
|
|
|
|
2011-01-07 14:49:40 +08:00
|
|
|
dentry->d_count--;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return;
|
|
|
|
|
2007-05-08 15:23:46 +08:00
|
|
|
kill_it:
|
2011-01-07 14:49:48 +08:00
|
|
|
dentry = dentry_kill(dentry, 1);
|
2007-05-08 15:23:46 +08:00
|
|
|
if (dentry)
|
|
|
|
goto repeat;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(dput);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_invalidate - invalidate a dentry
|
|
|
|
* @dentry: dentry to invalidate
|
|
|
|
*
|
|
|
|
* Try to invalidate the dentry if it turns out to be
|
|
|
|
* possible. If there are other dentries that can be
|
|
|
|
* reached through this one we can't delete it and we
|
|
|
|
* return -EBUSY. On success we return 0.
|
|
|
|
*
|
|
|
|
* no dcache lock.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int d_invalidate(struct dentry * dentry)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If it's already been dropped, return OK.
|
|
|
|
*/
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (d_unhashed(dentry)) {
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Check whether to do a partial shrink_dcache
|
|
|
|
* to get rid of unused child entries.
|
|
|
|
*/
|
|
|
|
if (!list_empty(&dentry->d_subdirs)) {
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
shrink_dcache_parent(dentry);
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Somebody else still using it?
|
|
|
|
*
|
|
|
|
* If it's a directory, we can't drop it
|
|
|
|
* for fear of somebody re-populating it
|
|
|
|
* with children (even though dropping it
|
|
|
|
* would make it unreachable from the root,
|
|
|
|
* we might still populate it if it was a
|
|
|
|
* working directory or similar).
|
2011-11-08 00:39:57 +08:00
|
|
|
* We also need to leave mountpoints alone,
|
|
|
|
* directory or not.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-11-08 00:39:57 +08:00
|
|
|
if (dentry->d_count > 1 && dentry->d_inode) {
|
|
|
|
if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return -EBUSY;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
__d_drop(dentry);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_invalidate);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:38 +08:00
|
|
|
/* This must be called with d_lock held */
|
2011-01-07 14:49:43 +08:00
|
|
|
static inline void __dget_dlock(struct dentry *dentry)
|
2011-01-07 14:49:31 +08:00
|
|
|
{
|
2011-01-07 14:49:32 +08:00
|
|
|
dentry->d_count++;
|
2011-01-07 14:49:31 +08:00
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:43 +08:00
|
|
|
static inline void __dget(struct dentry *dentry)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2011-01-07 14:49:43 +08:00
|
|
|
__dget_dlock(dentry);
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:32 +08:00
|
|
|
struct dentry *dget_parent(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct dentry *ret;
|
|
|
|
|
|
|
|
repeat:
|
2011-01-07 14:49:44 +08:00
|
|
|
/*
|
|
|
|
* Don't need rcu_dereference because we re-check it was correct under
|
|
|
|
* the lock.
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2011-01-07 14:49:32 +08:00
|
|
|
ret = dentry->d_parent;
|
2011-01-07 14:49:44 +08:00
|
|
|
spin_lock(&ret->d_lock);
|
|
|
|
if (unlikely(ret != dentry->d_parent)) {
|
|
|
|
spin_unlock(&ret->d_lock);
|
|
|
|
rcu_read_unlock();
|
2011-01-07 14:49:32 +08:00
|
|
|
goto repeat;
|
|
|
|
}
|
2011-01-07 14:49:44 +08:00
|
|
|
rcu_read_unlock();
|
2011-01-07 14:49:32 +08:00
|
|
|
BUG_ON(!ret->d_count);
|
|
|
|
ret->d_count++;
|
|
|
|
spin_unlock(&ret->d_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dget_parent);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* d_find_alias - grab a hashed alias of inode
|
|
|
|
* @inode: inode in question
|
|
|
|
* @want_discon: flag, used by d_splice_alias, to request
|
|
|
|
* that only a DISCONNECTED alias be returned.
|
|
|
|
*
|
|
|
|
* If inode has a hashed alias, or is a directory and has any alias,
|
|
|
|
* acquire the reference to alias and return it. Otherwise return NULL.
|
|
|
|
* Notice that if inode is a directory there can be only one alias and
|
|
|
|
* it can be unhashed only if it has no children, or if it is the root
|
|
|
|
* of a filesystem.
|
|
|
|
*
|
[PATCH] knfsd: close a race-opportunity in d_splice_alias
There is a possible race in d_splice_alias. Though __d_find_alias(inode, 1)
will only return a dentry with DCACHE_DISCONNECTED set, it is possible for it
to get cleared before the BUG_ON, and it is not possible to lock against
that.
There are a couple of problems here. Firstly, the code doesn't match the
comment. The comment describes a 'disconnected' dentry as being IS_ROOT as
well as DCACHE_DISCONNECTED, however there is no testing of IS_ROOT anywhere.
A dentry is marked DCACHE_DISCONNECTED when allocated with d_alloc_anon, and
remains DCACHE_DISCONNECTED while a path is built up towards the root. So a
dentry can have a valid name and a valid parent and even grandparent, but will
still be DCACHE_DISCONNECTED until a path to the root is created. Once the
path to the root is complete, everything in the path gets DCACHE_DISCONNECTED
cleared. So the fact that DCACHE_DISCONNECTED isn't enough to say that a
dentry is free to be spliced in with a given name. This can only be allowed
if the dentry does not yet have a name, so the IS_ROOT test is needed too.
However even adding that test to __d_find_alias isn't enough. As
d_splice_alias drops dcache_lock before calling d_move to perform the splice,
it could race with another thread calling d_splice_alias to splice the inode
in with a different name in a different part of the tree (in the case where a
file has hard links). So that splicing code is only really safe for
directories (as we know that directories only have one link). For
directories, the caller of d_splice_alias will be holding i_mutex on the
(unique) parent so there is no room for a race.
A consequence of this is that a non-directory will never benefit from being
spliced into a pre-existing dentry, but that isn't a problem. It is
perfectly OK for a non-directory to have multiple dentries, some anonymous,
some not. And the comment for d_splice_alias says that it only happens for
directories anyway.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-04 17:16:16 +08:00
|
|
|
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
|
2005-04-17 06:20:36 +08:00
|
|
|
* any other hashed alias over that one unless @want_discon is set,
|
[PATCH] knfsd: close a race-opportunity in d_splice_alias
There is a possible race in d_splice_alias. Though __d_find_alias(inode, 1)
will only return a dentry with DCACHE_DISCONNECTED set, it is possible for it
to get cleared before the BUG_ON, and it is not possible to lock against
that.
There are a couple of problems here. Firstly, the code doesn't match the
comment. The comment describes a 'disconnected' dentry as being IS_ROOT as
well as DCACHE_DISCONNECTED, however there is no testing of IS_ROOT anywhere.
A dentry is marked DCACHE_DISCONNECTED when allocated with d_alloc_anon, and
remains DCACHE_DISCONNECTED while a path is built up towards the root. So a
dentry can have a valid name and a valid parent and even grandparent, but will
still be DCACHE_DISCONNECTED until a path to the root is created. Once the
path to the root is complete, everything in the path gets DCACHE_DISCONNECTED
cleared. So the fact that DCACHE_DISCONNECTED isn't enough to say that a
dentry is free to be spliced in with a given name. This can only be allowed
if the dentry does not yet have a name, so the IS_ROOT test is needed too.
However even adding that test to __d_find_alias isn't enough. As
d_splice_alias drops dcache_lock before calling d_move to perform the splice,
it could race with another thread calling d_splice_alias to splice the inode
in with a different name in a different part of the tree (in the case where a
file has hard links). So that splicing code is only really safe for
directories (as we know that directories only have one link). For
directories, the caller of d_splice_alias will be holding i_mutex on the
(unique) parent so there is no room for a race.
A consequence of this is that a non-directory will never benefit from being
spliced into a pre-existing dentry, but that isn't a problem. It is
perfectly OK for a non-directory to have multiple dentries, some anonymous,
some not. And the comment for d_splice_alias says that it only happens for
directories anyway.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-04 17:16:16 +08:00
|
|
|
* in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-01-07 14:49:33 +08:00
|
|
|
static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-01-07 14:49:33 +08:00
|
|
|
struct dentry *alias, *discon_alias;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:33 +08:00
|
|
|
again:
|
|
|
|
discon_alias = NULL;
|
|
|
|
list_for_each_entry(alias, &inode->i_dentry, d_alias) {
|
|
|
|
spin_lock(&alias->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
|
[PATCH] knfsd: close a race-opportunity in d_splice_alias
There is a possible race in d_splice_alias. Though __d_find_alias(inode, 1)
will only return a dentry with DCACHE_DISCONNECTED set, it is possible for it
to get cleared before the BUG_ON, and it is is not possible to lock against
that.
There are a couple of problems here. Firstly, the code doesn't match the
comment. The comment describes a 'disconnected' dentry as being IS_ROOT as
well as DCACHE_DISCONNECTED, however there is not testing of IS_ROOT anythere.
A dentry is marked DCACHE_DISCONNECTED when allocated with d_alloc_anon, and
remains DCACHE_DISCONNECTED while a path is built up towards the root. So a
dentry can have a valid name and a valid parent and even grandparent, but will
still be DCACHE_DISCONNECTED until a path to the root is created. Once the
path to the root is complete, everything in the path gets DCACHE_DISCONNECTED
cleared. So the fact that DCACHE_DISCONNECTED isn't enough to say that a
dentry is free to be spliced in with a given name. This can only be allowed
if the dentry does not yet have a name, so the IS_ROOT test is needed too.
However even adding that test to __d_find_alias isn't enough. As
d_splice_alias drops dcache_lock before calling d_move to perform the splice,
it could race with another thread calling d_splice_alias to splice the inode
in with a different name in a different part of the tree (in the case where a
file has hard links). So that splicing code is only really safe for
directories (as we know that directories only have one link). For
directories, the caller of d_splice_alias will be holding i_mutex on the
(unique) parent so there is no room for a race.
A consequence of this is that a non-directory will never benefit from being
spliced into a pre-exisiting dentry, but that isn't a problem. It is
perfectly OK for a non-directory to have multiple dentries, some anonymous,
some not. And the comment for d_splice_alias says that it only happens for
directories anyway.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-04 17:16:16 +08:00
|
|
|
if (IS_ROOT(alias) &&
|
2011-01-07 14:49:33 +08:00
|
|
|
(alias->d_flags & DCACHE_DISCONNECTED)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
discon_alias = alias;
|
2011-01-07 14:49:33 +08:00
|
|
|
} else if (!want_discon) {
|
2011-01-07 14:49:43 +08:00
|
|
|
__dget_dlock(alias);
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
return alias;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
}
|
|
|
|
if (discon_alias) {
|
|
|
|
alias = discon_alias;
|
|
|
|
spin_lock(&alias->d_lock);
|
|
|
|
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
|
|
|
|
if (IS_ROOT(alias) &&
|
|
|
|
(alias->d_flags & DCACHE_DISCONNECTED)) {
|
2011-01-07 14:49:43 +08:00
|
|
|
__dget_dlock(alias);
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_unlock(&alias->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return alias;
|
|
|
|
}
|
|
|
|
}
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_unlock(&alias->d_lock);
|
|
|
|
goto again;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-01-07 14:49:33 +08:00
|
|
|
return NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:33 +08:00
|
|
|
struct dentry *d_find_alias(struct inode *inode)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-03-25 19:06:36 +08:00
|
|
|
struct dentry *de = NULL;
|
|
|
|
|
|
|
|
if (!list_empty(&inode->i_dentry)) {
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_lock(&inode->i_lock);
|
2006-03-25 19:06:36 +08:00
|
|
|
de = __d_find_alias(inode, 0);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2006-03-25 19:06:36 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return de;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_find_alias);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to kill dentries associated with this inode.
|
|
|
|
* WARNING: you must own a reference to inode.
|
|
|
|
*/
|
|
|
|
void d_prune_aliases(struct inode *inode)
|
|
|
|
{
|
2005-09-10 15:27:07 +08:00
|
|
|
struct dentry *dentry;
|
2005-04-17 06:20:36 +08:00
|
|
|
restart:
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_lock(&inode->i_lock);
|
2005-09-10 15:27:07 +08:00
|
|
|
list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2011-01-07 14:49:32 +08:00
|
|
|
if (!dentry->d_count) {
|
2011-01-07 14:49:43 +08:00
|
|
|
__dget_dlock(dentry);
|
2005-04-17 06:20:36 +08:00
|
|
|
__d_drop(dentry);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
dput(dentry);
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_prune_aliases);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
2011-01-07 14:49:48 +08:00
|
|
|
* Try to throw away a dentry - free the inode, dput the parent.
|
|
|
|
* Requires dentry->d_lock is held, and dentry->d_count == 0.
|
|
|
|
* Releases dentry->d_lock.
|
2006-06-23 05:47:31 +08:00
|
|
|
*
|
2011-01-07 14:49:48 +08:00
|
|
|
* This may fail if locks cannot be acquired; no problem, just try again.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-01-07 14:49:48 +08:00
|
|
|
static void try_prune_one_dentry(struct dentry *dentry)
|
2008-06-24 00:11:52 +08:00
|
|
|
__releases(dentry->d_lock)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-01-07 14:49:48 +08:00
|
|
|
struct dentry *parent;
|
2007-05-08 15:23:46 +08:00
|
|
|
|
2011-01-07 14:49:48 +08:00
|
|
|
parent = dentry_kill(dentry, 0);
|
2007-05-08 15:23:46 +08:00
|
|
|
/*
|
2011-01-07 14:49:48 +08:00
|
|
|
* If dentry_kill returns NULL, we have nothing more to do.
|
|
|
|
* if it returns the same dentry, trylocks failed. In either
|
|
|
|
* case, just loop again.
|
|
|
|
*
|
|
|
|
* Otherwise, we need to prune ancestors too. This is necessary
|
|
|
|
* to prevent quadratic behavior of shrink_dcache_parent(), but
|
|
|
|
* is also expected to be beneficial in reducing dentry cache
|
|
|
|
* fragmentation.
|
2007-05-08 15:23:46 +08:00
|
|
|
*/
|
2011-01-07 14:49:48 +08:00
|
|
|
if (!parent)
|
|
|
|
return;
|
|
|
|
if (parent == dentry)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Prune ancestors. */
|
|
|
|
dentry = parent;
|
2007-05-08 15:23:46 +08:00
|
|
|
while (dentry) {
|
2011-01-07 14:49:32 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2011-01-07 14:49:45 +08:00
|
|
|
if (dentry->d_count > 1) {
|
|
|
|
dentry->d_count--;
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
return;
|
|
|
|
}
|
2011-01-07 14:49:48 +08:00
|
|
|
dentry = dentry_kill(dentry, 1);
|
2007-05-08 15:23:46 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-10-10 17:36:25 +08:00
|
|
|
static void shrink_dentry_list(struct list_head *list)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
struct dentry *dentry;
|
|
|
|
|
2011-01-07 14:49:47 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
for (;;) {
|
|
|
|
dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
|
|
|
|
if (&dentry->d_lru == list)
|
|
|
|
break; /* empty */
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 14:49:31 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* We found an inuse dentry which was not removed from
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
* the LRU because of laziness during lookup. Do not free
|
|
|
|
* it - just keep it off the LRU list.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-01-07 14:49:32 +08:00
|
|
|
if (dentry->d_count) {
|
2011-01-07 14:49:47 +08:00
|
|
|
dentry_lru_del(dentry);
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
2011-01-07 14:49:47 +08:00
|
|
|
|
|
|
|
rcu_read_unlock();
|
2011-01-07 14:49:48 +08:00
|
|
|
|
|
|
|
try_prune_one_dentry(dentry);
|
|
|
|
|
2011-01-07 14:49:47 +08:00
|
|
|
rcu_read_lock();
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
}
|
2011-01-07 14:49:47 +08:00
|
|
|
rcu_read_unlock();
|
2010-10-10 17:36:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2011-08-23 16:56:24 +08:00
|
|
|
* prune_dcache_sb - shrink the dcache
|
|
|
|
* @sb: superblock
|
|
|
|
* @count: number of entries to try to free
|
|
|
|
*
|
|
|
|
* Attempt to shrink the superblock dcache LRU by @count entries. This is
|
|
|
|
* done when we need more memory and is called from the superblock shrinker
|
|
|
|
* function.
|
2010-10-10 17:36:25 +08:00
|
|
|
*
|
2011-08-23 16:56:24 +08:00
|
|
|
* This function may fail to free any resources if all the dentries are in
|
|
|
|
* use.
|
2010-10-10 17:36:25 +08:00
|
|
|
*/
|
2011-08-23 16:56:24 +08:00
|
|
|
void prune_dcache_sb(struct super_block *sb, int count)
|
2010-10-10 17:36:25 +08:00
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
LIST_HEAD(referenced);
|
|
|
|
LIST_HEAD(tmp);
|
|
|
|
|
2011-01-07 14:49:31 +08:00
|
|
|
relock:
|
|
|
|
spin_lock(&dcache_lru_lock);
|
2010-10-10 17:36:25 +08:00
|
|
|
while (!list_empty(&sb->s_dentry_lru)) {
|
|
|
|
dentry = list_entry(sb->s_dentry_lru.prev,
|
|
|
|
struct dentry, d_lru);
|
|
|
|
BUG_ON(dentry->d_sb != sb);
|
|
|
|
|
2011-01-07 14:49:31 +08:00
|
|
|
if (!spin_trylock(&dentry->d_lock)) {
|
|
|
|
spin_unlock(&dcache_lru_lock);
|
|
|
|
cpu_relax();
|
|
|
|
goto relock;
|
|
|
|
}
|
|
|
|
|
2011-08-23 16:56:24 +08:00
|
|
|
if (dentry->d_flags & DCACHE_REFERENCED) {
|
2011-01-07 14:49:31 +08:00
|
|
|
dentry->d_flags &= ~DCACHE_REFERENCED;
|
|
|
|
list_move(&dentry->d_lru, &referenced);
|
2010-10-10 17:36:25 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-01-07 14:49:31 +08:00
|
|
|
} else {
|
|
|
|
list_move_tail(&dentry->d_lru, &tmp);
|
fix shrink_dcache_parent() livelock
Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may
cause shrink_dcache_parent() to loop forever.
Here's what appears to happen:
1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1
2 - CPU1: select_parent(P) locks P->d_lock
3 - CPU0: shrink_dentry_list() locks C->d_lock
dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock
4 - CPU1: select_parent(P) locks C->d_lock,
moves C from dispose list being processed on CPU0 to the new
dispose list, returns 1
5 - CPU0: shrink_dentry_list() finds dispose list empty, returns
6 - Goto 2 with CPU0 and CPU1 switched
Basically select_parent() steals the dentry from shrink_dentry_list() and thinks
it found a new one, causing shrink_dentry_list() to think it's making progress
and loop over and over.
One way to trigger this is to make udev calls stat() on the sysfs file while it
is going away.
Having a file in /lib/udev/rules.d/ with only this one rule seems to the trick:
ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"
Then execute the following loop:
while true; do
echo -bond0 > /sys/class/net/bonding_masters
echo +bond0 > /sys/class/net/bonding_masters
echo -bond1 > /sys/class/net/bonding_masters
echo +bond1 > /sys/class/net/bonding_masters
done
One fix would be to check all callers and prevent concurrent calls to
shrink_dcache_parent(). But I think a better solution is to stop the
stealing behavior.
This patch adds a new dentry flag that is set when the dentry is added to the
dispose list. The flag is cleared in dentry_lru_del() in case the dentry gets a
new reference just before being pruned.
If the dentry has this flag, select_parent() will skip it and let
shrink_dentry_list() retry pruning it. With select_parent() skipping those
dentries there will not be the appearance of progress (new dentries found) when
there is none, hence shrink_dcache_parent() will not loop forever.
Set the flag is also set in prune_dcache_sb() for consistency as suggested by
Linus.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2012-01-11 01:22:25 +08:00
|
|
|
dentry->d_flags |= DCACHE_SHRINK_LIST;
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-07-08 12:14:42 +08:00
|
|
|
if (!--count)
|
2011-01-07 14:49:31 +08:00
|
|
|
break;
|
2010-10-10 17:36:25 +08:00
|
|
|
}
|
2011-01-07 14:49:47 +08:00
|
|
|
cond_resched_lock(&dcache_lru_lock);
|
2010-10-10 17:36:25 +08:00
|
|
|
}
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
if (!list_empty(&referenced))
|
|
|
|
list_splice(&referenced, &sb->s_dentry_lru);
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_unlock(&dcache_lru_lock);
|
2011-01-07 14:49:47 +08:00
|
|
|
|
|
|
|
shrink_dentry_list(&tmp);
|
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]
Split LRU-list of unused dentries to one per superblock to avoid soft
lock up during NFS mounts and remounting of any filesystem.
Previously I posted here:
http://lkml.org/lkml/2008/3/5/590
[Descriptions]
- background
dentry_unused is a list of dentries which are not referenced.
dentry_unused grows up when references on directories or files are
released. This list can be very long if there is huge free memory.
- the problem
When shrink_dcache_sb() is called, it scans all dentry_unused linearly
under spin_lock(), and if dentry->d_sb is differnt from given
superblock, scan next dentry. This scan costs very much if there are
many entries, and very ineffective if there are many superblocks.
IOW, When we need to shrink unused dentries on one dentry, but scans
unused dentries on all superblocks in the system. For example, we scan
500 dentries to unmount a filesystem, but scans 1,000,000 or more unused
dentries on other superblocks.
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink
unused dentries on NFS, but scans 100,000,000 unused dentries on
superblocks in the system such as local ext3 filesystems. I hear NFS
mounting took 1 min on some system in use.
* : NFS uses virtual filesystem in rpc layer, so NFS is affected by
this problem.
100,000,000 is possible number on large systems.
Per-superblock LRU of unused dentried can reduce the cost in
reasonable manner.
- How to fix
I found this problem is solved by David Chinner's "Per-superblock
unused dentry LRU lists V3"(1), so I rebase it and add some fix to
reclaim with fairness, which is in Andrew Morton's comments(2).
1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320
Split LRU-list of unused dentries to each superblocks. Then, NFS
mounting will check dentries under a superblock instead of all. But
this spliting will break LRU of dentry-unused. So, I've attempted to
make reclaim unused dentrins with fairness by calculate number of
dentries to scan on this sb based on following way
number of dentries to scan on this sb =
count * (number of dentries on this sb / number of dentries in the machine)
- ToDo
- I have to measuring performance number and do stress tests.
- When unmount occurs during prune_dcache(), scanning on same
superblock, It is unable to reach next superblock because it is gone
away. We restart scannig superblock from first one, it causes
unfairness of reclaim unused dentries on first superblock. But I think
this happens very rarely.
- Test Results
Result on 6GB boxes with excessive unused dentries.
Without patch:
$ cat /proc/sys/fs/dentry-state
10181835 10180203 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m1.830s
user 0m0.001s
sys 0m1.653s
With this patch:
$ cat /proc/sys/fs/dentry-state
10236610 10234751 45 0 0 0
# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real 0m0.106s
user 0m0.002s
sys 0m0.032s
[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:13 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* shrink_dcache_sb - shrink dcache for a superblock
|
|
|
|
* @sb: superblock
|
|
|
|
*
|
2010-10-10 17:36:25 +08:00
|
|
|
* Shrink the dcache for the specified super block. This is used to free
|
|
|
|
* the dcache before unmounting a file system.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2010-10-10 17:36:25 +08:00
|
|
|
void shrink_dcache_sb(struct super_block *sb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-10-10 17:36:25 +08:00
|
|
|
LIST_HEAD(tmp);
|
|
|
|
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_lock(&dcache_lru_lock);
|
2010-10-10 17:36:25 +08:00
|
|
|
while (!list_empty(&sb->s_dentry_lru)) {
|
|
|
|
list_splice_init(&sb->s_dentry_lru, &tmp);
|
2011-01-07 14:49:47 +08:00
|
|
|
spin_unlock(&dcache_lru_lock);
|
2010-10-10 17:36:25 +08:00
|
|
|
shrink_dentry_list(&tmp);
|
2011-01-07 14:49:47 +08:00
|
|
|
spin_lock(&dcache_lru_lock);
|
2010-10-10 17:36:25 +08:00
|
|
|
}
|
2011-01-07 14:49:31 +08:00
|
|
|
spin_unlock(&dcache_lru_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(shrink_dcache_sb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-10-11 16:22:19 +08:00
|
|
|
/*
|
|
|
|
* destroy a single subtree of dentries for unmount
|
|
|
|
* - see the comments on shrink_dcache_for_umount() for a description of the
|
|
|
|
* locking
|
|
|
|
*/
|
|
|
|
static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct dentry *parent;
|
|
|
|
|
|
|
|
BUG_ON(!IS_ROOT(dentry));
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
/* descend to the first leaf in the current subtree */
|
2011-06-07 21:09:30 +08:00
|
|
|
while (!list_empty(&dentry->d_subdirs))
|
2006-10-11 16:22:19 +08:00
|
|
|
dentry = list_entry(dentry->d_subdirs.next,
|
|
|
|
struct dentry, d_u.d_child);
|
|
|
|
|
|
|
|
/* consume the dentries from this leaf up through its parents
|
|
|
|
* until we find one with children or run out altogether */
|
|
|
|
do {
|
|
|
|
struct inode *inode;
|
|
|
|
|
2011-10-29 01:02:42 +08:00
|
|
|
/*
|
|
|
|
* remove the dentry from the lru, and inform
|
|
|
|
* the fs that this dentry is about to be
|
|
|
|
* unhashed and destroyed.
|
|
|
|
*/
|
|
|
|
dentry_lru_prune(dentry);
|
2011-06-07 21:09:30 +08:00
|
|
|
__d_shrink(dentry);
|
|
|
|
|
2011-01-07 14:49:32 +08:00
|
|
|
if (dentry->d_count != 0) {
|
2006-10-11 16:22:19 +08:00
|
|
|
printk(KERN_ERR
|
|
|
|
"BUG: Dentry %p{i=%lx,n=%s}"
|
|
|
|
" still in use (%d)"
|
|
|
|
" [unmount of %s %s]\n",
|
|
|
|
dentry,
|
|
|
|
dentry->d_inode ?
|
|
|
|
dentry->d_inode->i_ino : 0UL,
|
|
|
|
dentry->d_name.name,
|
2011-01-07 14:49:32 +08:00
|
|
|
dentry->d_count,
|
2006-10-11 16:22:19 +08:00
|
|
|
dentry->d_sb->s_type->name,
|
|
|
|
dentry->d_sb->s_id);
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
if (IS_ROOT(dentry)) {
|
2006-10-11 16:22:19 +08:00
|
|
|
parent = NULL;
|
2011-01-07 14:49:34 +08:00
|
|
|
list_del(&dentry->d_u.d_child);
|
|
|
|
} else {
|
2008-10-16 06:50:27 +08:00
|
|
|
parent = dentry->d_parent;
|
2011-01-07 14:49:32 +08:00
|
|
|
parent->d_count--;
|
2011-01-07 14:49:34 +08:00
|
|
|
list_del(&dentry->d_u.d_child);
|
2008-10-16 06:50:27 +08:00
|
|
|
}
|
2006-10-11 16:22:19 +08:00
|
|
|
|
|
|
|
inode = dentry->d_inode;
|
|
|
|
if (inode) {
|
|
|
|
dentry->d_inode = NULL;
|
|
|
|
list_del_init(&dentry->d_alias);
|
|
|
|
if (dentry->d_op && dentry->d_op->d_iput)
|
|
|
|
dentry->d_op->d_iput(dentry, inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
d_free(dentry);
|
|
|
|
|
|
|
|
/* finished when we fall off the top of the tree,
|
|
|
|
* otherwise we ascend to the parent and move to the
|
|
|
|
* next sibling if there is one */
|
|
|
|
if (!parent)
|
2010-10-10 17:36:23 +08:00
|
|
|
return;
|
2006-10-11 16:22:19 +08:00
|
|
|
dentry = parent;
|
|
|
|
} while (list_empty(&dentry->d_subdirs));
|
|
|
|
|
|
|
|
dentry = list_entry(dentry->d_subdirs.next,
|
|
|
|
struct dentry, d_u.d_child);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* destroy the dentries attached to a superblock on unmounting
|
2011-01-07 14:49:38 +08:00
|
|
|
* - we don't need to use dentry->d_lock because:
|
2006-10-11 16:22:19 +08:00
|
|
|
* - the superblock is detached from all mountings and open files, so the
|
|
|
|
* dentry trees will not be rearranged by the VFS
|
|
|
|
* - s_umount is write-locked, so the memory pressure shrinker will ignore
|
|
|
|
* any dentries belonging to this superblock that it comes across
|
|
|
|
* - the filesystem itself is no longer permitted to rearrange the dentries
|
|
|
|
* in this superblock
|
|
|
|
*/
|
|
|
|
void shrink_dcache_for_umount(struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
|
|
|
|
if (down_read_trylock(&sb->s_umount))
|
|
|
|
BUG();
|
|
|
|
|
|
|
|
dentry = sb->s_root;
|
|
|
|
sb->s_root = NULL;
|
2011-01-07 14:49:32 +08:00
|
|
|
dentry->d_count--;
|
2006-10-11 16:22:19 +08:00
|
|
|
shrink_dcache_for_umount_subtree(dentry);
|
|
|
|
|
2011-01-07 14:50:05 +08:00
|
|
|
while (!hlist_bl_empty(&sb->s_anon)) {
|
|
|
|
dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
|
2006-10-11 16:22:19 +08:00
|
|
|
shrink_dcache_for_umount_subtree(dentry);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-16 06:29:21 +08:00
|
|
|
/*
|
|
|
|
* This tries to ascend one level of parenthood, but
|
|
|
|
* we can race with renaming, so we need to re-check
|
|
|
|
* the parenthood after dropping the lock and check
|
|
|
|
* that the sequence number still matches.
|
|
|
|
*/
|
|
|
|
static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
|
|
|
|
{
|
|
|
|
struct dentry *new = old->d_parent;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
spin_unlock(&old->d_lock);
|
|
|
|
spin_lock(&new->d_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* might go back up the wrong parent if we have had a rename
|
|
|
|
* or deletion
|
|
|
|
*/
|
|
|
|
if (new != old->d_parent ||
|
2011-03-16 01:36:43 +08:00
|
|
|
(old->d_flags & DCACHE_DISCONNECTED) ||
|
2011-03-16 06:29:21 +08:00
|
|
|
(!locked && read_seqretry(&rename_lock, seq))) {
|
|
|
|
spin_unlock(&new->d_lock);
|
|
|
|
new = NULL;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
return new;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Search for at least 1 mount point in the dentry's subdirs.
|
|
|
|
* We descend to the next level whenever the d_subdirs
|
|
|
|
* list is non-empty and continue searching.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* have_submounts - check for mounts over a dentry
|
|
|
|
* @parent: dentry to check.
|
|
|
|
*
|
|
|
|
* Return true if the parent or its subdirectories contain
|
|
|
|
* a mount point
|
|
|
|
*/
|
|
|
|
int have_submounts(struct dentry *parent)
|
|
|
|
{
|
2011-01-07 14:49:37 +08:00
|
|
|
struct dentry *this_parent;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct list_head *next;
|
2011-01-07 14:49:37 +08:00
|
|
|
unsigned seq;
|
2011-01-07 14:49:39 +08:00
|
|
|
int locked = 0;
|
2011-01-07 14:49:37 +08:00
|
|
|
|
|
|
|
seq = read_seqbegin(&rename_lock);
|
2011-01-07 14:49:39 +08:00
|
|
|
again:
|
|
|
|
this_parent = parent;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (d_mountpoint(parent))
|
|
|
|
goto positive;
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_lock(&this_parent->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
repeat:
|
|
|
|
next = this_parent->d_subdirs.next;
|
|
|
|
resume:
|
|
|
|
while (next != &this_parent->d_subdirs) {
|
|
|
|
struct list_head *tmp = next;
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because breaking the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using an union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
next = tmp->next;
|
2011-01-07 14:49:34 +08:00
|
|
|
|
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Have we found a mount point ? */
|
2011-01-07 14:49:34 +08:00
|
|
|
if (d_mountpoint(dentry)) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
spin_unlock(&this_parent->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto positive;
|
2011-01-07 14:49:34 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!list_empty(&dentry->d_subdirs)) {
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
|
|
|
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
|
2005-04-17 06:20:36 +08:00
|
|
|
this_parent = dentry;
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto repeat;
|
|
|
|
}
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* All done at this level ... ascend and resume the search.
|
|
|
|
*/
|
|
|
|
if (this_parent != parent) {
|
2011-03-16 06:29:21 +08:00
|
|
|
struct dentry *child = this_parent;
|
|
|
|
this_parent = try_to_ascend(this_parent, locked, seq);
|
|
|
|
if (!this_parent)
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
|
|
|
next = child->d_u.d_child.next;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto resume;
|
|
|
|
}
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
2011-01-07 14:49:39 +08:00
|
|
|
if (!locked && read_seqretry(&rename_lock, seq))
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
2011-01-07 14:49:39 +08:00
|
|
|
if (locked)
|
|
|
|
write_sequnlock(&rename_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0; /* No mount points found in tree */
|
|
|
|
positive:
|
2011-01-07 14:49:39 +08:00
|
|
|
if (!locked && read_seqretry(&rename_lock, seq))
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
2011-01-07 14:49:39 +08:00
|
|
|
if (locked)
|
|
|
|
write_sequnlock(&rename_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 1;
|
2011-01-07 14:49:39 +08:00
|
|
|
|
|
|
|
rename_retry:
|
|
|
|
locked = 1;
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
goto again;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(have_submounts);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Search the dentry child list for the specified parent,
|
|
|
|
* and move any unused dentries to the end of the unused
|
|
|
|
* list for prune_dcache(). We descend to the next level
|
|
|
|
* whenever the d_subdirs list is non-empty and continue
|
|
|
|
* searching.
|
|
|
|
*
|
|
|
|
* It returns zero iff there are no unused children,
|
|
|
|
* otherwise it returns the number of children moved to
|
|
|
|
* the end of the unused list. This may not be the total
|
|
|
|
* number of unused children, because select_parent can
|
|
|
|
* drop the lock and return early due to latency
|
|
|
|
* constraints.
|
|
|
|
*/
|
2011-08-23 16:56:24 +08:00
|
|
|
static int select_parent(struct dentry *parent, struct list_head *dispose)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-01-07 14:49:37 +08:00
|
|
|
struct dentry *this_parent;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct list_head *next;
|
2011-01-07 14:49:37 +08:00
|
|
|
unsigned seq;
|
2005-04-17 06:20:36 +08:00
|
|
|
int found = 0;
|
2011-01-07 14:49:39 +08:00
|
|
|
int locked = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:37 +08:00
|
|
|
seq = read_seqbegin(&rename_lock);
|
2011-01-07 14:49:39 +08:00
|
|
|
again:
|
|
|
|
this_parent = parent;
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_lock(&this_parent->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
repeat:
|
|
|
|
next = this_parent->d_subdirs.next;
|
|
|
|
resume:
|
|
|
|
while (next != &this_parent->d_subdirs) {
|
|
|
|
struct list_head *tmp = next;
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because breaking the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using an union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
next = tmp->next;
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
2011-01-07 14:49:31 +08:00
|
|
|
|
2011-08-23 16:56:24 +08:00
|
|
|
/*
|
|
|
|
* move only zero ref count dentries to the dispose list.
|
fix shrink_dcache_parent() livelock
Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may
cause shrink_dcache_parent() to loop forever.
Here's what appears to happen:
1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1
2 - CPU1: select_parent(P) locks P->d_lock
3 - CPU0: shrink_dentry_list() locks C->d_lock
dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock
4 - CPU1: select_parent(P) locks C->d_lock,
moves C from dispose list being processed on CPU0 to the new
dispose list, returns 1
5 - CPU0: shrink_dentry_list() finds dispose list empty, returns
6 - Goto 2 with CPU0 and CPU1 switched
Basically select_parent() steals the dentry from shrink_dentry_list() and thinks
it found a new one, causing shrink_dentry_list() to think it's making progress
and loop over and over.
One way to trigger this is to make udev calls stat() on the sysfs file while it
is going away.
Having a file in /lib/udev/rules.d/ with only this one rule seems to the trick:
ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"
Then execute the following loop:
while true; do
echo -bond0 > /sys/class/net/bonding_masters
echo +bond0 > /sys/class/net/bonding_masters
echo -bond1 > /sys/class/net/bonding_masters
echo +bond1 > /sys/class/net/bonding_masters
done
One fix would be to check all callers and prevent concurrent calls to
shrink_dcache_parent(). But I think a better solution is to stop the
stealing behavior.
This patch adds a new dentry flag that is set when the dentry is added to the
dispose list. The flag is cleared in dentry_lru_del() in case the dentry gets a
new reference just before being pruned.
If the dentry has this flag, select_parent() will skip it and let
shrink_dentry_list() retry pruning it. With select_parent() skipping those
dentries there will not be the appearance of progress (new dentries found) when
there is none, hence shrink_dcache_parent() will not loop forever.
Set the flag is also set in prune_dcache_sb() for consistency as suggested by
Linus.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2012-01-11 01:22:25 +08:00
|
|
|
*
|
|
|
|
* Those which are presently on the shrink list, being processed
|
|
|
|
* by shrink_dentry_list(), shouldn't be moved. Otherwise the
|
|
|
|
* loop in shrink_dcache_parent() might not make any progress
|
|
|
|
* and loop forever.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
fix shrink_dcache_parent() livelock
Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may
cause shrink_dcache_parent() to loop forever.
Here's what appears to happen:
1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1
2 - CPU1: select_parent(P) locks P->d_lock
3 - CPU0: shrink_dentry_list() locks C->d_lock
dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock
4 - CPU1: select_parent(P) locks C->d_lock,
moves C from dispose list being processed on CPU0 to the new
dispose list, returns 1
5 - CPU0: shrink_dentry_list() finds dispose list empty, returns
6 - Goto 2 with CPU0 and CPU1 switched
Basically select_parent() steals the dentry from shrink_dentry_list() and thinks
it found a new one, causing shrink_dentry_list() to think it's making progress
and loop over and over.
One way to trigger this is to make udev calls stat() on the sysfs file while it
is going away.
Having a file in /lib/udev/rules.d/ with only this one rule seems to the trick:
ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"
Then execute the following loop:
while true; do
echo -bond0 > /sys/class/net/bonding_masters
echo +bond0 > /sys/class/net/bonding_masters
echo -bond1 > /sys/class/net/bonding_masters
echo +bond1 > /sys/class/net/bonding_masters
done
One fix would be to check all callers and prevent concurrent calls to
shrink_dcache_parent(). But I think a better solution is to stop the
stealing behavior.
This patch adds a new dentry flag that is set when the dentry is added to the
dispose list. The flag is cleared in dentry_lru_del() in case the dentry gets a
new reference just before being pruned.
If the dentry has this flag, select_parent() will skip it and let
shrink_dentry_list() retry pruning it. With select_parent() skipping those
dentries there will not be the appearance of progress (new dentries found) when
there is none, hence shrink_dcache_parent() will not loop forever.
Set the flag is also set in prune_dcache_sb() for consistency as suggested by
Linus.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2012-01-11 01:22:25 +08:00
|
|
|
if (dentry->d_count) {
|
|
|
|
dentry_lru_del(dentry);
|
|
|
|
} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
|
2011-08-23 16:56:24 +08:00
|
|
|
dentry_lru_move_list(dentry, dispose);
|
fix shrink_dcache_parent() livelock
Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may
cause shrink_dcache_parent() to loop forever.
Here's what appears to happen:
1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1
2 - CPU1: select_parent(P) locks P->d_lock
3 - CPU0: shrink_dentry_list() locks C->d_lock
dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock
4 - CPU1: select_parent(P) locks C->d_lock,
moves C from dispose list being processed on CPU0 to the new
dispose list, returns 1
5 - CPU0: shrink_dentry_list() finds dispose list empty, returns
6 - Goto 2 with CPU0 and CPU1 switched
Basically select_parent() steals the dentry from shrink_dentry_list() and thinks
it found a new one, causing shrink_dentry_list() to think it's making progress
and loop over and over.
One way to trigger this is to make udev calls stat() on the sysfs file while it
is going away.
Having a file in /lib/udev/rules.d/ with only this one rule seems to the trick:
ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"
Then execute the following loop:
while true; do
echo -bond0 > /sys/class/net/bonding_masters
echo +bond0 > /sys/class/net/bonding_masters
echo -bond1 > /sys/class/net/bonding_masters
echo +bond1 > /sys/class/net/bonding_masters
done
One fix would be to check all callers and prevent concurrent calls to
shrink_dcache_parent(). But I think a better solution is to stop the
stealing behavior.
This patch adds a new dentry flag that is set when the dentry is added to the
dispose list. The flag is cleared in dentry_lru_del() in case the dentry gets a
new reference just before being pruned.
If the dentry has this flag, select_parent() will skip it and let
shrink_dentry_list() retry pruning it. With select_parent() skipping those
dentries there will not be the appearance of progress (new dentries found) when
there is none, hence shrink_dcache_parent() will not loop forever.
Set the flag is also set in prune_dcache_sb() for consistency as suggested by
Linus.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2012-01-11 01:22:25 +08:00
|
|
|
dentry->d_flags |= DCACHE_SHRINK_LIST;
|
2005-04-17 06:20:36 +08:00
|
|
|
found++;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We can return to the caller if we have found some (this
|
|
|
|
* ensures forward progress). We'll be coming back to find
|
|
|
|
* the rest.
|
|
|
|
*/
|
2011-01-07 14:49:34 +08:00
|
|
|
if (found && need_resched()) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
2011-01-07 14:49:34 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Descend a level if the d_subdirs list is non-empty.
|
|
|
|
*/
|
|
|
|
if (!list_empty(&dentry->d_subdirs)) {
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
|
|
|
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
|
2005-04-17 06:20:36 +08:00
|
|
|
this_parent = dentry;
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto repeat;
|
|
|
|
}
|
2011-01-07 14:49:34 +08:00
|
|
|
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* All done at this level ... ascend and resume the search.
|
|
|
|
*/
|
|
|
|
if (this_parent != parent) {
|
2011-03-16 06:29:21 +08:00
|
|
|
struct dentry *child = this_parent;
|
|
|
|
this_parent = try_to_ascend(this_parent, locked, seq);
|
|
|
|
if (!this_parent)
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
|
|
|
next = child->d_u.d_child.next;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto resume;
|
|
|
|
}
|
|
|
|
out:
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
2011-01-07 14:49:39 +08:00
|
|
|
if (!locked && read_seqretry(&rename_lock, seq))
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
2011-01-07 14:49:39 +08:00
|
|
|
if (locked)
|
|
|
|
write_sequnlock(&rename_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return found;
|
2011-01-07 14:49:39 +08:00
|
|
|
|
|
|
|
rename_retry:
|
|
|
|
if (found)
|
|
|
|
return found;
|
|
|
|
locked = 1;
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
goto again;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* shrink_dcache_parent - prune dcache
|
|
|
|
* @parent: parent of entries to prune
|
|
|
|
*
|
|
|
|
* Prune the dcache to remove unused children of the parent dentry.
|
|
|
|
*/
|
|
|
|
void shrink_dcache_parent(struct dentry * parent)
|
|
|
|
{
|
2011-08-23 16:56:24 +08:00
|
|
|
LIST_HEAD(dispose);
|
2005-04-17 06:20:36 +08:00
|
|
|
int found;
|
|
|
|
|
2011-08-23 16:56:24 +08:00
|
|
|
while ((found = select_parent(parent, &dispose)) != 0)
|
|
|
|
shrink_dentry_list(&dispose);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(shrink_dcache_parent);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
2011-07-08 03:03:58 +08:00
|
|
|
* __d_alloc - allocate a dcache entry
|
|
|
|
* @sb: filesystem it will belong to
|
2005-04-17 06:20:36 +08:00
|
|
|
* @name: qstr of the name
|
|
|
|
*
|
|
|
|
* Allocates a dentry. It returns %NULL if there is insufficient memory
|
|
|
|
* available. On a success the dentry is returned. The name passed in is
|
|
|
|
* copied and the copy passed in may be reused after this call.
|
|
|
|
*/
|
|
|
|
|
2011-07-08 03:03:58 +08:00
|
|
|
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
char *dname;
|
|
|
|
|
2007-10-16 16:25:52 +08:00
|
|
|
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!dentry)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (name->len > DNAME_INLINE_LEN-1) {
|
|
|
|
dname = kmalloc(name->len + 1, GFP_KERNEL);
|
|
|
|
if (!dname) {
|
|
|
|
kmem_cache_free(dentry_cache, dentry);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
dname = dentry->d_iname;
|
|
|
|
}
|
|
|
|
dentry->d_name.name = dname;
|
|
|
|
|
|
|
|
dentry->d_name.len = name->len;
|
|
|
|
dentry->d_name.hash = name->hash;
|
|
|
|
memcpy(dname, name->name, name->len);
|
|
|
|
dname[name->len] = 0;
|
|
|
|
|
2011-01-07 14:49:32 +08:00
|
|
|
dentry->d_count = 1;
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually have a 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DENTRY_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DENTRY_RCUACCESS bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DENTRY_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddenly that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details doesn't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 22:58:46 +08:00
|
|
|
dentry->d_flags = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock_init(&dentry->d_lock);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
seqcount_init(&dentry->d_seq);
|
2005-04-17 06:20:36 +08:00
|
|
|
dentry->d_inode = NULL;
|
2011-07-08 03:03:58 +08:00
|
|
|
dentry->d_parent = dentry;
|
|
|
|
dentry->d_sb = sb;
|
2005-04-17 06:20:36 +08:00
|
|
|
dentry->d_op = NULL;
|
|
|
|
dentry->d_fsdata = NULL;
|
2011-01-07 14:50:05 +08:00
|
|
|
INIT_HLIST_BL_NODE(&dentry->d_hash);
|
2005-04-17 06:20:36 +08:00
|
|
|
INIT_LIST_HEAD(&dentry->d_lru);
|
|
|
|
INIT_LIST_HEAD(&dentry->d_subdirs);
|
|
|
|
INIT_LIST_HEAD(&dentry->d_alias);
|
2011-01-07 14:49:34 +08:00
|
|
|
INIT_LIST_HEAD(&dentry->d_u.d_child);
|
2011-07-08 03:03:58 +08:00
|
|
|
d_set_d_op(dentry, dentry->d_sb->s_d_op);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
fs: use fast counters for vfs caches
percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.
The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.
In the fastpath, it is the difference between this:
incl %gs:nr_dentry # nr_dentry
and this:
movl percpu_counter_batch(%rip), %edx # percpu_counter_batch,
movl $1, %esi #,
movq $nr_dentry, %rdi #,
call __percpu_counter_add # (plus I clobber registers)
__percpu_counter_add:
pushq %rbp #
movq %rsp, %rbp #,
subq $32, %rsp #,
movq %rbx, -24(%rbp) #,
movq %r12, -16(%rbp) #,
movq %r13, -8(%rbp) #,
movq %rdi, %rbx # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
incl -8124(%rax) # <variable>.preempt_count
movq 32(%rdi), %r12 # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
add %gs:this_cpu_off, %r12 # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
movslq (%r12),%r13 #* tcp_ptr__, tmp73
movslq %edx,%rax # batch, batch
addq %rsi, %r13 # amount, count
cmpq %rax, %r13 # batch, count
jge .L27 #,
negl %edx # tmp76
movslq %edx,%rdx # tmp76, tmp77
cmpq %rdx, %r13 # tmp77, count
jg .L28 #,
.L27:
movq %rbx, %rdi # fbc,
call _raw_spin_lock #
addq %r13, 8(%rbx) # count, <variable>.count
movq %rbx, %rdi # fbc,
movl $0, (%r12) #,* tcp_ptr__
call _raw_spin_unlock #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
movq %gs:kernel_stack,%rax #, pfo_ret__
# 0 "" 2
#NO_APP
decl -8124(%rax) # <variable>.preempt_count
movq -8136(%rax), %rax #, D.14625
testb $8, %al #, D.14625
jne .L32 #,
.L31:
movq -24(%rbp), %rbx #,
movq -16(%rbp), %r12 #,
movq -8(%rbp), %r13 #,
leave
ret
.p2align 4,,10
.p2align 3
.L28:
movl %r13d, (%r12) # count,*
jmp .L29 #
.L32:
call preempt_schedule #
.p2align 4,,6
jmp .L31 #
.size __percpu_counter_add, .-__percpu_counter_add
.p2align 4,,15
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:19 +08:00
|
|
|
this_cpu_inc(nr_dentry);
|
2010-10-10 17:36:23 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return dentry;
|
|
|
|
}
|
2011-07-08 03:03:58 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_alloc - allocate a dcache entry
|
|
|
|
* @parent: parent of entry to allocate
|
|
|
|
* @name: qstr of the name
|
|
|
|
*
|
|
|
|
* Allocates a dentry. It returns %NULL if there is insufficient memory
|
|
|
|
* available. On a success the dentry is returned. The name passed in is
|
|
|
|
* copied and the copy passed in may be reused after this call.
|
|
|
|
*/
|
|
|
|
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
|
|
|
|
{
|
|
|
|
struct dentry *dentry = __d_alloc(parent->d_sb, name);
|
|
|
|
if (!dentry)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
spin_lock(&parent->d_lock);
|
|
|
|
/*
|
|
|
|
* don't need child lock because it is not subject
|
|
|
|
* to concurrency here
|
|
|
|
*/
|
|
|
|
__dget_dlock(parent);
|
|
|
|
dentry->d_parent = parent;
|
|
|
|
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
|
|
|
|
spin_unlock(&parent->d_lock);
|
|
|
|
|
|
|
|
return dentry;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_alloc);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:50:07 +08:00
|
|
|
/**
 * d_alloc_pseudo - allocate a dentry and mark it disconnected
 * @sb: superblock passed through to __d_alloc()
 * @name: qstr of the name, passed through to __d_alloc()
 *
 * Allocates a dentry with __d_alloc() and, if that succeeded, sets
 * DCACHE_DISCONNECTED in its d_flags. No parent dentry is involved here.
 * Returns the dentry, or %NULL if __d_alloc() returned %NULL.
 */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
|
|
|
|
{
|
2011-07-08 03:03:58 +08:00
|
|
|
struct dentry *dentry = __d_alloc(sb, name);
|
|
|
|
if (dentry)
|
2011-01-07 14:50:07 +08:00
|
|
|
dentry->d_flags |= DCACHE_DISCONNECTED;
|
|
|
|
return dentry;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_alloc_pseudo);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
 * d_alloc_name - allocate a child dentry from a C string name
 * @parent: parent dentry, passed through to d_alloc()
 * @name: NUL-terminated name of the new entry
 *
 * Builds a temporary qstr on the stack (length from strlen(), hash from
 * full_name_hash()) and hands it to d_alloc(). Returns whatever d_alloc()
 * returns.
 */
struct dentry *d_alloc_name(struct dentry *parent, const char *name)
|
|
|
|
{
|
|
|
|
struct qstr q;
|
|
|
|
|
|
|
|
q.name = name;
|
|
|
|
q.len = strlen(name);
|
|
|
|
q.hash = full_name_hash(q.name, q.len);
|
|
|
|
return d_alloc(parent, &q);
|
|
|
|
}
|
2009-09-30 08:09:42 +08:00
|
|
|
EXPORT_SYMBOL(d_alloc_name);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:55 +08:00
|
|
|
/**
 * d_set_d_op - install dentry operations and cache which ones exist
 * @dentry: dentry to update
 * @op: operations table to install; may be %NULL
 *
 * Warns (once) if @dentry already has a d_op, or if any of the
 * DCACHE_OP_HASH/COMPARE/REVALIDATE/DELETE bits are already set in d_flags.
 * Then stores @op in dentry->d_op and, for each of d_hash, d_compare,
 * d_revalidate, d_delete and d_prune that is non-NULL in @op, sets the
 * matching DCACHE_OP_* bit in d_flags. With a %NULL @op no flag is set.
 *
 * NOTE(review): DCACHE_OP_PRUNE is set below but is not part of the mask
 * checked by the second WARN_ON_ONCE() - confirm whether that is intended.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
|
|
|
|
{
|
2011-01-15 05:26:18 +08:00
|
|
|
WARN_ON_ONCE(dentry->d_op);
|
|
|
|
WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
|
2011-01-07 14:49:55 +08:00
|
|
|
DCACHE_OP_COMPARE |
|
|
|
|
DCACHE_OP_REVALIDATE |
|
|
|
|
DCACHE_OP_DELETE ));
|
|
|
|
dentry->d_op = op;
|
|
|
|
if (!op)
|
|
|
|
return;
|
|
|
|
if (op->d_hash)
|
|
|
|
dentry->d_flags |= DCACHE_OP_HASH;
|
|
|
|
if (op->d_compare)
|
|
|
|
dentry->d_flags |= DCACHE_OP_COMPARE;
|
|
|
|
if (op->d_revalidate)
|
|
|
|
dentry->d_flags |= DCACHE_OP_REVALIDATE;
|
|
|
|
if (op->d_delete)
|
|
|
|
dentry->d_flags |= DCACHE_OP_DELETE;
|
2011-10-29 01:02:42 +08:00
|
|
|
if (op->d_prune)
|
|
|
|
dentry->d_flags |= DCACHE_OP_PRUNE;
|
2011-01-07 14:49:55 +08:00
|
|
|
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_set_d_op);
|
|
|
|
|
2008-10-16 06:50:28 +08:00
|
|
|
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
|
|
|
|
{
|
2011-01-07 14:49:35 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
Add a dentry op to handle automounting rather than abusing follow_link()
Add a dentry op (d_automount) to handle automounting directories rather than
abusing the follow_link() inode operation. The operation is keyed off a new
dentry flag (DCACHE_NEED_AUTOMOUNT).
This also makes it easier to add an AT_ flag to suppress terminal segment
automount during pathwalk and removes the need for the kludge code in the
pathwalk algorithm to handle directories with follow_link() semantics.
The ->d_automount() dentry operation:
struct vfsmount *(*d_automount)(struct path *mountpoint);
takes a pointer to the directory to be mounted upon, which is expected to
provide sufficient data to determine what should be mounted. If successful, it
should return the vfsmount struct it creates (which it should also have added
to the namespace using do_add_mount() or similar). If there's a collision with
another automount attempt, NULL should be returned. If the directory specified
by the parameter should be used directly rather than being mounted upon,
-EISDIR should be returned. In any other case, an error code should be
returned.
The ->d_automount() operation is called with no locks held and may sleep. At
this point the pathwalk algorithm will be in ref-walk mode.
Within fs/namei.c itself, a new pathwalk subroutine (follow_automount()) is
added to handle mountpoints. It will return -EREMOTE if the automount flag was
set, but no d_automount() op was supplied, -ELOOP if we've encountered too many
symlinks or mountpoints, -EISDIR if the walk point should be used without
mounting and 0 if successful. The path will be updated to point to the mounted
filesystem if a successful automount took place.
__follow_mount() is replaced by follow_managed() which is more generic
(especially with the patch that adds ->d_manage()). This handles transits from
directories during pathwalk, including automounting and skipping over
mountpoints (and holding processes with the next patch).
__follow_mount_rcu() will jump out of RCU-walk mode if it encounters an
automount point with nothing mounted on it.
follow_dotdot*() does not handle automounts as you don't want to trigger them
whilst following "..".
I've also extracted the mount/don't-mount logic from autofs4 and included it
here. It makes the mount go ahead anyway if someone calls open() or creat(),
tries to traverse the directory, tries to chdir/chroot/etc. into the directory,
or sticks a '/' on the end of the pathname. If they do a stat(), however,
they'll only trigger the automount if they didn't also say O_NOFOLLOW.
I've also added an inode flag (S_AUTOMOUNT) so that filesystems can mark their
inodes as automount points. This flag is automatically propagated to the
dentry as DCACHE_NEED_AUTOMOUNT by __d_instantiate(). This saves NFS and could
save AFS a private flag bit apiece, but is not strictly necessary. It would be
preferable to do the propagation in d_set_d_op(), but that doesn't normally
have access to the inode.
[AV: fixed breakage in case if __follow_mount_rcu() fails and nameidata_drop_rcu()
succeeds in RCU case of do_lookup(); we need to fall through to non-RCU case after
that, rather than just returning with ungrabbed *path]
Signed-off-by: David Howells <dhowells@redhat.com>
Was-Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-15 02:45:21 +08:00
|
|
|
if (inode) {
|
|
|
|
if (unlikely(IS_AUTOMOUNT(inode)))
|
|
|
|
dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
|
2008-10-16 06:50:28 +08:00
|
|
|
list_add(&dentry->d_alias, &inode->i_dentry);
|
Add a dentry op to handle automounting rather than abusing follow_link()
Add a dentry op (d_automount) to handle automounting directories rather than
abusing the follow_link() inode operation. The operation is keyed off a new
dentry flag (DCACHE_NEED_AUTOMOUNT).
This also makes it easier to add an AT_ flag to suppress terminal segment
automount during pathwalk and removes the need for the kludge code in the
pathwalk algorithm to handle directories with follow_link() semantics.
The ->d_automount() dentry operation:
struct vfsmount *(*d_automount)(struct path *mountpoint);
takes a pointer to the directory to be mounted upon, which is expected to
provide sufficient data to determine what should be mounted. If successful, it
should return the vfsmount struct it creates (which it should also have added
to the namespace using do_add_mount() or similar). If there's a collision with
another automount attempt, NULL should be returned. If the directory specified
by the parameter should be used directly rather than being mounted upon,
-EISDIR should be returned. In any other case, an error code should be
returned.
The ->d_automount() operation is called with no locks held and may sleep. At
this point the pathwalk algorithm will be in ref-walk mode.
Within fs/namei.c itself, a new pathwalk subroutine (follow_automount()) is
added to handle mountpoints. It will return -EREMOTE if the automount flag was
set, but no d_automount() op was supplied, -ELOOP if we've encountered too many
symlinks or mountpoints, -EISDIR if the walk point should be used without
mounting and 0 if successful. The path will be updated to point to the mounted
filesystem if a successful automount took place.
__follow_mount() is replaced by follow_managed() which is more generic
(especially with the patch that adds ->d_manage()). This handles transits from
directories during pathwalk, including automounting and skipping over
mountpoints (and holding processes with the next patch).
__follow_mount_rcu() will jump out of RCU-walk mode if it encounters an
automount point with nothing mounted on it.
follow_dotdot*() does not handle automounts as you don't want to trigger them
whilst following "..".
I've also extracted the mount/don't-mount logic from autofs4 and included it
here. It makes the mount go ahead anyway if someone calls open() or creat(),
tries to traverse the directory, tries to chdir/chroot/etc. into the directory,
or sticks a '/' on the end of the pathname. If they do a stat(), however,
they'll only trigger the automount if they didn't also say O_NOFOLLOW.
I've also added an inode flag (S_AUTOMOUNT) so that filesystems can mark their
inodes as automount points. This flag is automatically propagated to the
dentry as DCACHE_NEED_AUTOMOUNT by __d_instantiate(). This saves NFS and could
save AFS a private flag bit apiece, but is not strictly necessary. It would be
preferable to do the propagation in d_set_d_op(), but that doesn't normally
have access to the inode.
[AV: fixed breakage in case if __follow_mount_rcu() fails and nameidata_drop_rcu()
succeeds in RCU case of do_lookup(); we need to fall through to non-RCU case after
that, rather than just returning with ungrabbed *path]
Signed-off-by: David Howells <dhowells@redhat.com>
Was-Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-15 02:45:21 +08:00
|
|
|
}
|
2008-10-16 06:50:28 +08:00
|
|
|
dentry->d_inode = inode;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
dentry_rcuwalk_barrier(dentry);
|
2011-01-07 14:49:35 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2008-10-16 06:50:28 +08:00
|
|
|
fsnotify_d_instantiate(dentry, inode);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* d_instantiate - fill in inode information for a dentry
|
|
|
|
* @entry: dentry to complete
|
|
|
|
* @inode: inode to attach to this dentry
|
|
|
|
*
|
|
|
|
* Fill in inode information in the entry.
|
|
|
|
*
|
|
|
|
* This turns negative dentries into productive full members
|
|
|
|
* of society.
|
|
|
|
*
|
|
|
|
* NOTE! This assumes that the inode count has been incremented
|
|
|
|
* (or otherwise set) by the caller to indicate that it is now
|
|
|
|
* in use by the dcache.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void d_instantiate(struct dentry *entry, struct inode * inode)
|
|
|
|
{
|
2006-03-27 00:25:39 +08:00
|
|
|
BUG_ON(!list_empty(&entry->d_alias));
|
2011-01-07 14:50:06 +08:00
|
|
|
if (inode)
|
|
|
|
spin_lock(&inode->i_lock);
|
2008-10-16 06:50:28 +08:00
|
|
|
__d_instantiate(entry, inode);
|
2011-01-07 14:50:06 +08:00
|
|
|
if (inode)
|
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
security_d_instantiate(entry, inode);
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_instantiate);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_instantiate_unique - instantiate a non-aliased dentry
|
|
|
|
* @entry: dentry to instantiate
|
|
|
|
* @inode: inode to attach to this dentry
|
|
|
|
*
|
|
|
|
* Fill in inode information in the entry. On success, it returns NULL.
|
|
|
|
* If an unhashed alias of "entry" already exists, then we return the
|
2006-01-10 12:52:51 +08:00
|
|
|
* aliased dentry instead and drop one reference to inode.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Note that in order to avoid conflicts with rename() etc, the caller
|
|
|
|
* had better be holding the parent directory semaphore.
|
2006-01-10 12:52:51 +08:00
|
|
|
*
|
|
|
|
* This also assumes that the inode count has been incremented
|
|
|
|
* (or otherwise set) by the caller to indicate that it is now
|
|
|
|
* in use by the dcache.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-08-23 08:06:07 +08:00
|
|
|
static struct dentry *__d_instantiate_unique(struct dentry *entry,
|
|
|
|
struct inode *inode)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct dentry *alias;
|
|
|
|
int len = entry->d_name.len;
|
|
|
|
const char *name = entry->d_name.name;
|
|
|
|
unsigned int hash = entry->d_name.hash;
|
|
|
|
|
2006-08-23 08:06:07 +08:00
|
|
|
if (!inode) {
|
2008-10-16 06:50:28 +08:00
|
|
|
__d_instantiate(entry, NULL);
|
2006-08-23 08:06:07 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
list_for_each_entry(alias, &inode->i_dentry, d_alias) {
|
|
|
|
struct qstr *qstr = &alias->d_name;
|
|
|
|
|
2011-01-07 14:49:36 +08:00
|
|
|
/*
|
|
|
|
* Don't need alias->d_lock here, because aliases with
|
|
|
|
* d_parent == entry->d_parent are not subject to name or
|
|
|
|
* parent changes, because the parent inode i_mutex is held.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
if (qstr->hash != hash)
|
|
|
|
continue;
|
|
|
|
if (alias->d_parent != entry->d_parent)
|
|
|
|
continue;
|
2011-01-07 14:50:09 +08:00
|
|
|
if (dentry_cmp(qstr->name, qstr->len, name, len))
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
2011-01-07 14:49:43 +08:00
|
|
|
__dget(alias);
|
2005-04-17 06:20:36 +08:00
|
|
|
return alias;
|
|
|
|
}
|
2006-08-23 08:06:07 +08:00
|
|
|
|
2008-10-16 06:50:28 +08:00
|
|
|
__d_instantiate(entry, inode);
|
2005-04-17 06:20:36 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
2006-08-23 08:06:07 +08:00
|
|
|
|
|
|
|
struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct dentry *result;
|
|
|
|
|
|
|
|
BUG_ON(!list_empty(&entry->d_alias));
|
|
|
|
|
2011-01-07 14:50:06 +08:00
|
|
|
if (inode)
|
|
|
|
spin_lock(&inode->i_lock);
|
2006-08-23 08:06:07 +08:00
|
|
|
result = __d_instantiate_unique(entry, inode);
|
2011-01-07 14:50:06 +08:00
|
|
|
if (inode)
|
|
|
|
spin_unlock(&inode->i_lock);
|
2006-08-23 08:06:07 +08:00
|
|
|
|
|
|
|
if (!result) {
|
|
|
|
security_d_instantiate(entry, inode);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(!d_unhashed(result));
|
|
|
|
iput(inode);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL(d_instantiate_unique);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* d_alloc_root - allocate root dentry
|
|
|
|
* @root_inode: inode to allocate the root for
|
|
|
|
*
|
|
|
|
* Allocate a root ("/") dentry for the inode given. The inode is
|
|
|
|
* instantiated and returned. %NULL is returned if there is insufficient
|
|
|
|
* memory or the inode passed is %NULL.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct dentry * d_alloc_root(struct inode * root_inode)
|
|
|
|
{
|
|
|
|
struct dentry *res = NULL;
|
|
|
|
|
|
|
|
if (root_inode) {
|
|
|
|
static const struct qstr name = { .name = "/", .len = 1 };
|
|
|
|
|
2011-07-08 03:03:58 +08:00
|
|
|
res = __d_alloc(root_inode->i_sb, &name);
|
|
|
|
if (res)
|
2005-04-17 06:20:36 +08:00
|
|
|
d_instantiate(res, root_inode);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_alloc_root);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-01-09 05:49:21 +08:00
|
|
|
/**
 * d_make_root - allocate a root dentry, dropping the inode on failure
 * @root_inode: inode to allocate the root for; may be %NULL
 *
 * Allocates a "/" dentry on root_inode->i_sb with __d_alloc() and attaches
 * @root_inode to it with d_instantiate(). If the allocation fails, the
 * inode is released with iput(). Returns the new dentry, or %NULL if
 * @root_inode is %NULL or the allocation failed.
 */
struct dentry *d_make_root(struct inode *root_inode)
|
|
|
|
{
|
|
|
|
struct dentry *res = NULL;
|
|
|
|
|
|
|
|
if (root_inode) {
|
|
|
|
static const struct qstr name = { .name = "/", .len = 1 };
|
|
|
|
|
|
|
|
res = __d_alloc(root_inode->i_sb, &name);
|
|
|
|
if (res)
|
|
|
|
d_instantiate(res, root_inode);
|
|
|
|
else
|
|
|
|
/* no dentry to hold the inode: release it here */
iput(root_inode);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(d_make_root);
|
|
|
|
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be the DCACHE_UNHASHED alias referenced by the
filp, so it creates a new DCACHE_DISCONNECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONNECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-19 04:45:09 +08:00
|
|
|
static struct dentry * __d_find_any_alias(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct dentry *alias;
|
|
|
|
|
|
|
|
if (list_empty(&inode->i_dentry))
|
|
|
|
return NULL;
|
|
|
|
alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
|
|
|
|
__dget(alias);
|
|
|
|
return alias;
|
|
|
|
}
|
|
|
|
|
2012-01-11 01:04:37 +08:00
|
|
|
/**
|
|
|
|
* d_find_any_alias - find any alias for a given inode
|
|
|
|
* @inode: inode to find an alias for
|
|
|
|
*
|
|
|
|
* If any aliases exist for the given inode, take and return a
|
|
|
|
* reference for one of them. If no aliases exist, return %NULL.
|
|
|
|
*/
|
|
|
|
struct dentry *d_find_any_alias(struct inode *inode)
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-19 04:45:09 +08:00
|
|
|
{
|
|
|
|
struct dentry *de;
|
|
|
|
|
|
|
|
spin_lock(&inode->i_lock);
|
|
|
|
de = __d_find_any_alias(inode);
|
|
|
|
spin_unlock(&inode->i_lock);
|
|
|
|
return de;
|
|
|
|
}
|
2012-01-11 01:04:37 +08:00
|
|
|
EXPORT_SYMBOL(d_find_any_alias);
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be the DCACHE_UNHASHED alias referenced by the filp;
the alias lookup does not accept this, so it creates a new
DCACHE_DISCONNECTED dentry and returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONNECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-19 04:45:09 +08:00
|
|
|
|
2008-08-11 21:48:57 +08:00
|
|
|
/**
|
|
|
|
* d_obtain_alias - find or allocate a dentry for a given inode
|
|
|
|
* @inode: inode to allocate the dentry for
|
|
|
|
*
|
|
|
|
* Obtain a dentry for an inode resulting from NFS filehandle conversion or
|
|
|
|
* similar open by handle operations. The returned dentry may be anonymous,
|
|
|
|
* or may have a full name (if the inode was already in the cache).
|
|
|
|
*
|
|
|
|
* When called on a directory inode, we must ensure that the inode only ever
|
|
|
|
* has one dentry. If a dentry is found, that is returned instead of
|
|
|
|
* allocating a new one.
|
|
|
|
*
|
|
|
|
* On successful return, the reference to the inode has been transferred
|
2008-08-11 21:49:04 +08:00
|
|
|
* to the dentry. In case of an error the reference on the inode is released.
|
|
|
|
* To make it easier to use in export operations a %NULL or IS_ERR inode may
|
|
|
|
* be passed in and will be the error will be propagate to the return value,
|
|
|
|
* with a %NULL @inode replaced by ERR_PTR(-ESTALE).
|
2008-08-11 21:48:57 +08:00
|
|
|
*/
|
|
|
|
struct dentry *d_obtain_alias(struct inode *inode)
|
|
|
|
{
|
2008-08-11 21:49:12 +08:00
|
|
|
static const struct qstr anonstring = { .name = "" };
|
|
|
|
struct dentry *tmp;
|
|
|
|
struct dentry *res;
|
2008-08-11 21:48:57 +08:00
|
|
|
|
|
|
|
if (!inode)
|
2008-08-11 21:49:04 +08:00
|
|
|
return ERR_PTR(-ESTALE);
|
2008-08-11 21:48:57 +08:00
|
|
|
if (IS_ERR(inode))
|
|
|
|
return ERR_CAST(inode);
|
|
|
|
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-19 04:45:09 +08:00
|
|
|
res = d_find_any_alias(inode);
|
2008-08-11 21:49:12 +08:00
|
|
|
if (res)
|
|
|
|
goto out_iput;
|
|
|
|
|
2011-07-08 03:03:58 +08:00
|
|
|
tmp = __d_alloc(inode->i_sb, &anonstring);
|
2008-08-11 21:49:12 +08:00
|
|
|
if (!tmp) {
|
|
|
|
res = ERR_PTR(-ENOMEM);
|
|
|
|
goto out_iput;
|
2008-08-11 21:48:57 +08:00
|
|
|
}
|
2011-01-07 14:49:38 +08:00
|
|
|
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_lock(&inode->i_lock);
|
fs/dcache: allow d_obtain_alias() to return unhashed dentries
Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:
client$ mount -tnfs4 server:/export/ /mnt/
client$ tail -f /mnt/FOO
...
server$ df -i /export
server$ rm /export/FOO
(^C the tail -f)
server$ df -i /export
server$ echo 2 >/proc/sys/vm/drop_caches
server$ df -i /export
the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.
This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:
- putfh: look up the filehandle. The only alias found for the
inode will be DCACHE_UNHASHED alias referenced by the filp
this, so it creates a new DCACHE_DISCONECTED dentry and
returns that instead.
- close: closes the existing filp, which is destroyed
immediately by dput() since it's DCACHE_UNHASHED.
- end of the compound: release the reference
to the current filehandle, and dput() the new
DCACHE_DISCONECTED dentry, which gets put on the
unused list instead of being destroyed immediately.
Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.
Leave __d_find_alias() alone to avoid changing behavior of other
callers.
Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-01-19 04:45:09 +08:00
|
|
|
res = __d_find_any_alias(inode);
|
2008-08-11 21:49:12 +08:00
|
|
|
if (res) {
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2008-08-11 21:49:12 +08:00
|
|
|
dput(tmp);
|
|
|
|
goto out_iput;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* attach a disconnected dentry */
|
|
|
|
spin_lock(&tmp->d_lock);
|
|
|
|
tmp->d_inode = inode;
|
|
|
|
tmp->d_flags |= DCACHE_DISCONNECTED;
|
|
|
|
list_add(&tmp->d_alias, &inode->i_dentry);
|
2011-04-26 02:01:36 +08:00
|
|
|
hlist_bl_lock(&tmp->d_sb->s_anon);
|
2011-01-07 14:50:05 +08:00
|
|
|
hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
|
2011-04-26 02:01:36 +08:00
|
|
|
hlist_bl_unlock(&tmp->d_sb->s_anon);
|
2008-08-11 21:49:12 +08:00
|
|
|
spin_unlock(&tmp->d_lock);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2010-11-19 09:52:55 +08:00
|
|
|
security_d_instantiate(tmp, inode);
|
2008-08-11 21:49:12 +08:00
|
|
|
|
|
|
|
return tmp;
|
|
|
|
|
|
|
|
out_iput:
|
2010-11-19 09:52:55 +08:00
|
|
|
if (res && !IS_ERR(res))
|
|
|
|
security_d_instantiate(res, inode);
|
2008-08-11 21:49:12 +08:00
|
|
|
iput(inode);
|
|
|
|
return res;
|
2008-08-11 21:48:57 +08:00
|
|
|
}
|
2009-02-28 06:02:59 +08:00
|
|
|
EXPORT_SYMBOL(d_obtain_alias);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* d_splice_alias - splice a disconnected dentry into the tree if one exists
|
|
|
|
* @inode: the inode which may have a disconnected dentry
|
|
|
|
* @dentry: a negative dentry which we want to point to the inode.
|
|
|
|
*
|
|
|
|
* If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
|
|
|
|
* DCACHE_DISCONNECTED), then d_move that in place of the given dentry
|
|
|
|
* and return it, else simply d_add the inode to the dentry and return NULL.
|
|
|
|
*
|
|
|
|
* This is needed in the lookup routine of any filesystem that is exportable
|
|
|
|
* (via knfsd) so that we can build dcache paths to directories effectively.
|
|
|
|
*
|
|
|
|
* If a dentry was found and moved, then it is returned. Otherwise NULL
|
|
|
|
* is returned. This matches the expected return value of ->lookup.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct dentry *new = NULL;
|
|
|
|
|
2011-07-09 09:20:11 +08:00
|
|
|
if (IS_ERR(inode))
|
|
|
|
return ERR_CAST(inode);
|
|
|
|
|
[PATCH] knfsd: close a race-opportunity in d_splice_alias
There is a possible race in d_splice_alias. Though __d_find_alias(inode, 1)
will only return a dentry with DCACHE_DISCONNECTED set, it is possible for it
to get cleared before the BUG_ON, and it is is not possible to lock against
that.
There are a couple of problems here. Firstly, the code doesn't match the
comment. The comment describes a 'disconnected' dentry as being IS_ROOT as
well as DCACHE_DISCONNECTED, however there is not testing of IS_ROOT anythere.
A dentry is marked DCACHE_DISCONNECTED when allocated with d_alloc_anon, and
remains DCACHE_DISCONNECTED while a path is built up towards the root. So a
dentry can have a valid name and a valid parent and even grandparent, but will
still be DCACHE_DISCONNECTED until a path to the root is created. Once the
path to the root is complete, everything in the path gets DCACHE_DISCONNECTED
cleared. So the fact that DCACHE_DISCONNECTED isn't enough to say that a
dentry is free to be spliced in with a given name. This can only be allowed
if the dentry does not yet have a name, so the IS_ROOT test is needed too.
However even adding that test to __d_find_alias isn't enough. As
d_splice_alias drops dcache_lock before calling d_move to perform the splice,
it could race with another thread calling d_splice_alias to splice the inode
in with a different name in a different part of the tree (in the case where a
file has hard links). So that splicing code is only really safe for
directories (as we know that directories only have one link). For
directories, the caller of d_splice_alias will be holding i_mutex on the
(unique) parent so there is no room for a race.
A consequence of this is that a non-directory will never benefit from being
spliced into a pre-exisiting dentry, but that isn't a problem. It is
perfectly OK for a non-directory to have multiple dentries, some anonymous,
some not. And the comment for d_splice_alias says that it only happens for
directories anyway.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-04 17:16:16 +08:00
|
|
|
if (inode && S_ISDIR(inode->i_mode)) {
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_lock(&inode->i_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
new = __d_find_alias(inode, 1);
|
|
|
|
if (new) {
|
|
|
|
BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
security_d_instantiate(new, inode);
|
|
|
|
d_move(new, dentry);
|
|
|
|
iput(inode);
|
|
|
|
} else {
|
2011-01-07 14:50:06 +08:00
|
|
|
/* already taking inode->i_lock, so d_add() by hand */
|
2008-10-16 06:50:28 +08:00
|
|
|
__d_instantiate(dentry, inode);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
security_d_instantiate(dentry, inode);
|
|
|
|
d_rehash(dentry);
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
d_add(dentry, inode);
|
|
|
|
return new;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_splice_alias);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-05-21 14:50:46 +08:00
|
|
|
/**
|
|
|
|
* d_add_ci - lookup or allocate new dentry with case-exact name
|
|
|
|
* @inode: the inode case-insensitive lookup has found
|
|
|
|
* @dentry: the negative dentry that was passed to the parent's lookup func
|
|
|
|
* @name: the case-exact name to be associated with the returned dentry
|
|
|
|
*
|
|
|
|
* This is to avoid filling the dcache with case-insensitive names to the
|
|
|
|
* same inode, only the actual correct case is stored in the dcache for
|
|
|
|
* case-insensitive filesystems.
|
|
|
|
*
|
|
|
|
* For a case-insensitive lookup match and if the the case-exact dentry
|
|
|
|
* already exists in in the dcache, use it and return it.
|
|
|
|
*
|
|
|
|
* If no entry exists with the exact case name, allocate new dentry with
|
|
|
|
* the exact case, and return the spliced entry.
|
|
|
|
*/
|
2008-08-08 05:49:07 +08:00
|
|
|
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
|
2008-05-21 14:50:46 +08:00
|
|
|
struct qstr *name)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct dentry *found;
|
|
|
|
struct dentry *new;
|
|
|
|
|
2009-01-06 02:10:37 +08:00
|
|
|
/*
|
|
|
|
* First check if a dentry matching the name already exists,
|
|
|
|
* if not go ahead and create it now.
|
|
|
|
*/
|
2008-05-21 14:50:46 +08:00
|
|
|
found = d_hash_and_lookup(dentry->d_parent, name);
|
|
|
|
if (!found) {
|
|
|
|
new = d_alloc(dentry->d_parent, name);
|
|
|
|
if (!new) {
|
|
|
|
error = -ENOMEM;
|
|
|
|
goto err_out;
|
|
|
|
}
|
2009-01-06 02:10:37 +08:00
|
|
|
|
2008-05-21 14:50:46 +08:00
|
|
|
found = d_splice_alias(inode, new);
|
|
|
|
if (found) {
|
|
|
|
dput(new);
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
return new;
|
|
|
|
}
|
2009-01-06 02:10:37 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If a matching dentry exists, and it's not negative use it.
|
|
|
|
*
|
|
|
|
* Decrement the reference count to balance the iget() done
|
|
|
|
* earlier on.
|
|
|
|
*/
|
2008-05-21 14:50:46 +08:00
|
|
|
if (found->d_inode) {
|
|
|
|
if (unlikely(found->d_inode != inode)) {
|
|
|
|
/* This can't happen because bad inodes are unhashed. */
|
|
|
|
BUG_ON(!is_bad_inode(inode));
|
|
|
|
BUG_ON(!is_bad_inode(found->d_inode));
|
|
|
|
}
|
|
|
|
iput(inode);
|
|
|
|
return found;
|
|
|
|
}
|
2009-01-06 02:10:37 +08:00
|
|
|
|
2008-05-21 14:50:46 +08:00
|
|
|
/*
|
2011-05-31 23:58:49 +08:00
|
|
|
* We are going to instantiate this dentry, unhash it and clear the
|
|
|
|
* lookup flag so we can do that.
|
2008-05-21 14:50:46 +08:00
|
|
|
*/
|
2011-05-31 23:58:49 +08:00
|
|
|
if (unlikely(d_need_lookup(found)))
|
|
|
|
d_clear_need_lookup(found);
|
2009-01-06 02:10:37 +08:00
|
|
|
|
2008-05-21 14:50:46 +08:00
|
|
|
/*
|
|
|
|
* Negative dentry: instantiate it unless the inode is a directory and
|
2009-01-06 02:10:37 +08:00
|
|
|
* already has a dentry.
|
2008-05-21 14:50:46 +08:00
|
|
|
*/
|
2011-07-17 22:52:14 +08:00
|
|
|
new = d_splice_alias(inode, found);
|
|
|
|
if (new) {
|
|
|
|
dput(found);
|
|
|
|
found = new;
|
2008-05-21 14:50:46 +08:00
|
|
|
}
|
2011-07-17 22:52:14 +08:00
|
|
|
return found;
|
2008-05-21 14:50:46 +08:00
|
|
|
|
|
|
|
err_out:
|
|
|
|
iput(inode);
|
|
|
|
return ERR_PTR(error);
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_add_ci);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
/**
|
|
|
|
* __d_lookup_rcu - search for a dentry (racy, store-free)
|
|
|
|
* @parent: parent dentry
|
|
|
|
* @name: qstr of name we wish to find
|
|
|
|
* @seq: returns d_seq value at the point where the dentry was found
|
|
|
|
* @inode: returns dentry->d_inode when the inode was found valid.
|
|
|
|
* Returns: dentry, or NULL
|
|
|
|
*
|
|
|
|
* __d_lookup_rcu is the dcache lookup function for rcu-walk name
|
|
|
|
* resolution (store-free path walking) design described in
|
|
|
|
* Documentation/filesystems/path-lookup.txt.
|
|
|
|
*
|
|
|
|
* This is not to be used outside core vfs.
|
|
|
|
*
|
|
|
|
* __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
|
|
|
|
* held, and rcu_read_lock held. The returned dentry must not be stored into
|
|
|
|
* without taking d_lock and checking d_seq sequence count against @seq
|
|
|
|
* returned here.
|
|
|
|
*
|
|
|
|
* A refcount may be taken on the found dentry with the __d_rcu_to_refcount
|
|
|
|
* function.
|
|
|
|
*
|
|
|
|
* Alternatively, __d_lookup_rcu may be called again to look up the child of
|
|
|
|
* the returned dentry, so long as its parent's seqlock is checked after the
|
|
|
|
* child is looked up. Thus, an interlocking stepping of sequence lock checks
|
|
|
|
* is formed, giving integrity down the path walk.
|
|
|
|
*/
|
|
|
|
struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
|
|
|
|
unsigned *seq, struct inode **inode)
|
|
|
|
{
|
|
|
|
unsigned int len = name->len;
|
|
|
|
unsigned int hash = name->hash;
|
|
|
|
const unsigned char *str = name->name;
|
2011-04-24 13:32:03 +08:00
|
|
|
struct hlist_bl_head *b = d_hash(parent, hash);
|
2011-01-07 14:50:05 +08:00
|
|
|
struct hlist_bl_node *node;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
struct dentry *dentry;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: There is significant duplication with __d_lookup_rcu which is
|
|
|
|
* required to prevent single threaded performance regressions
|
|
|
|
* especially on architectures where smp_rmb (in seqcounts) are costly.
|
|
|
|
* Keep the two functions in sync.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hash list is protected using RCU.
|
|
|
|
*
|
|
|
|
* Carefully use d_seq when comparing a candidate dentry, to avoid
|
|
|
|
* races with d_move().
|
|
|
|
*
|
|
|
|
* It is possible that concurrent renames can mess up our list
|
|
|
|
* walk here and result in missing our dentry, resulting in the
|
|
|
|
* false-negative result. d_lookup() protects against concurrent
|
|
|
|
* renames using rename_lock seqlock.
|
|
|
|
*
|
2011-01-22 14:31:32 +08:00
|
|
|
* See Documentation/filesystems/path-lookup.txt for more details.
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
*/
|
2011-04-24 13:32:03 +08:00
|
|
|
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
struct inode *i;
|
|
|
|
const char *tname;
|
|
|
|
int tlen;
|
|
|
|
|
|
|
|
if (dentry->d_name.hash != hash)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
seqretry:
|
|
|
|
*seq = read_seqcount_begin(&dentry->d_seq);
|
|
|
|
if (dentry->d_parent != parent)
|
|
|
|
continue;
|
|
|
|
if (d_unhashed(dentry))
|
|
|
|
continue;
|
|
|
|
tlen = dentry->d_name.len;
|
|
|
|
tname = dentry->d_name.name;
|
|
|
|
i = dentry->d_inode;
|
2011-01-07 14:50:08 +08:00
|
|
|
prefetch(tname);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
/*
|
|
|
|
* This seqcount check is required to ensure name and
|
|
|
|
* len are loaded atomically, so as not to walk off the
|
|
|
|
* edge of memory when walking. If we could load this
|
|
|
|
* atomically some other way, we could drop this check.
|
|
|
|
*/
|
|
|
|
if (read_seqcount_retry(&dentry->d_seq, *seq))
|
|
|
|
goto seqretry;
|
2011-08-07 13:41:50 +08:00
|
|
|
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
if (parent->d_op->d_compare(parent, *inode,
|
|
|
|
dentry, i,
|
|
|
|
tlen, tname, name))
|
|
|
|
continue;
|
|
|
|
} else {
|
2011-01-07 14:50:09 +08:00
|
|
|
if (dentry_cmp(tname, tlen, str, len))
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* No extra seqcount check is required after the name
|
|
|
|
* compare. The caller must perform a seqcount check in
|
|
|
|
* order to do anything useful with the returned dentry
|
|
|
|
* anyway.
|
|
|
|
*/
|
|
|
|
*inode = i;
|
|
|
|
return dentry;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* d_lookup - search for a dentry
|
|
|
|
* @parent: parent dentry
|
|
|
|
* @name: qstr of name we wish to find
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup raciness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:34 +08:00
|
|
|
* Returns: dentry, or NULL
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup raciness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:34 +08:00
|
|
|
* d_lookup searches the children of the parent dentry for the name in
|
|
|
|
* question. If the dentry is found its reference count is incremented and the
|
|
|
|
* dentry is returned. The caller must use dput to free the entry when it has
|
|
|
|
* finished using it. %NULL is returned if the dentry does not exist.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
struct dentry *dentry;
|
2011-01-07 14:49:37 +08:00
|
|
|
unsigned seq;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
seq = read_seqbegin(&rename_lock);
|
|
|
|
dentry = __d_lookup(parent, name);
|
|
|
|
if (dentry)
|
|
|
|
break;
|
|
|
|
} while (read_seqretry(&rename_lock, seq));
|
|
|
|
return dentry;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_lookup);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
/**
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup raciness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:34 +08:00
|
|
|
* __d_lookup - search for a dentry (racy)
|
|
|
|
* @parent: parent dentry
|
|
|
|
* @name: qstr of name we wish to find
|
|
|
|
* Returns: dentry, or NULL
|
|
|
|
*
|
|
|
|
* __d_lookup is like d_lookup, however it may (rarely) return a
|
|
|
|
* false-negative result due to unrelated rename activity.
|
|
|
|
*
|
|
|
|
* __d_lookup is slightly faster by avoiding rename_lock read seqlock,
|
|
|
|
* however it must be used carefully, eg. with a following d_lookup in
|
|
|
|
* the case of failure.
|
|
|
|
*
|
|
|
|
* __d_lookup callers must be commented.
|
|
|
|
*/
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned int len = name->len;
|
|
|
|
unsigned int hash = name->hash;
|
|
|
|
const unsigned char *str = name->name;
|
2011-04-24 13:32:03 +08:00
|
|
|
struct hlist_bl_head *b = d_hash(parent, hash);
|
2011-01-07 14:50:05 +08:00
|
|
|
struct hlist_bl_node *node;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
struct dentry *found = NULL;
|
2005-11-07 16:59:17 +08:00
|
|
|
struct dentry *dentry;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
/*
|
|
|
|
* Note: There is significant duplication with __d_lookup_rcu which is
|
|
|
|
* required to prevent single threaded performance regressions
|
|
|
|
* especially on architectures where smp_rmb (in seqcounts) are costly.
|
|
|
|
* Keep the two functions in sync.
|
|
|
|
*/
|
|
|
|
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:34 +08:00
|
|
|
/*
|
|
|
|
* The hash list is protected using RCU.
|
|
|
|
*
|
|
|
|
* Take d_lock when comparing a candidate dentry, to avoid races
|
|
|
|
* with d_move().
|
|
|
|
*
|
|
|
|
* It is possible that concurrent renames can mess up our list
|
|
|
|
* walk here and result in missing our dentry, resulting in the
|
|
|
|
* false-negative result. d_lookup() protects against concurrent
|
|
|
|
* renames using rename_lock seqlock.
|
|
|
|
*
|
2011-01-22 14:31:32 +08:00
|
|
|
* See Documentation/filesystems/path-lookup.txt for more details.
|
fs: remove extra lookup in __lookup_hash
fs: remove extra lookup in __lookup_hash
Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.
Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:
- We are interested in how the RCU lookup works here, particularly with
renames. Make that explicit, and point to the document where it is explained
in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
code used to work, more about the interesting parts of how it works now. So
comments about lazy LRU may be interesting, but would better be done in the
LRU or refcount management code.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:34 +08:00
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
|
2011-04-24 13:32:03 +08:00
|
|
|
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
const char *tname;
|
|
|
|
int tlen;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (dentry->d_name.hash != hash)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
if (dentry->d_parent != parent)
|
|
|
|
goto next;
|
Fix NULL pointer dereference in proc_sys_compare
The VFS interface for the 'd_compare()' is a bit special (read: 'odd'),
because it really just essentially replaces a memcmp(). The filesystem
is supposed to just compare the two names with whatever case-independent
or other function.
And when I say 'is supposed to', I obviously mean that 'procfs does odd
things, and actually looks at the dentry that we don't even pass down,
rather than just the name'. Which results in problems, because we
actually call d_compare before we have even verified that the dentry is
still hashed at all.
And that causes a problem since the inode that procfs looks at may have
been free'd and the d_inode pointer is NULL. procfs just assumes that
all dentries are positive, since procfs itself never generates a
negative one. But memory pressure will still result in the dentry
getting torn down, and as it is removed by RCU, it still remains visible
on some lists - and to d_compare.
If the filesystem just did a name comparison, we wouldn't care. And we
could just fix procfs to know about negative dentries too. But rather
than have the low-level filesystems know about internal VFS details,
just move the check for a unhashed dentry up a bit, so that we will only
call d_compare on dentries that are still active.
The actual oops this caused didn't look like a NULL pointer dereference
because procfs did a 'container_of(inode, struct proc_inode, vfs_inode)'
to get at its internal proc_inode information from the inode pointer,
and accessed a field below the inode. So the oops would look something
like
BUG: unable to handle kernel paging request at fffffffffffffff0
IP: [<ffffffff802bc6c6>] proc_sys_compare+0x36/0x50
and was seen on both x86-64 (Alexey Dobriyan and Hugh Dickins) and
ppc64 (Hugh Dickins).
Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-of-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-29 22:42:57 +08:00
|
|
|
if (d_unhashed(dentry))
|
|
|
|
goto next;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* It is safe to compare names since d_move() cannot
|
|
|
|
* change the qstr (protected by d_lock).
|
|
|
|
*/
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
tlen = dentry->d_name.len;
|
|
|
|
tname = dentry->d_name.name;
|
2011-01-07 14:49:55 +08:00
|
|
|
if (parent->d_flags & DCACHE_OP_COMPARE) {
|
2011-01-07 14:49:27 +08:00
|
|
|
if (parent->d_op->d_compare(parent, parent->d_inode,
|
|
|
|
dentry, dentry->d_inode,
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
tlen, tname, name))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next;
|
|
|
|
} else {
|
2011-01-07 14:50:09 +08:00
|
|
|
if (dentry_cmp(tname, tlen, str, len))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:32 +08:00
|
|
|
dentry->d_count++;
|
Fix NULL pointer dereference in proc_sys_compare
The VFS interface for the 'd_compare()' is a bit special (read: 'odd'),
because it really just essentially replaces a memcmp(). The filesystem
is supposed to just compare the two names with whatever case-independent
or other function.
And when I say 'is supposed to', I obviously mean that 'procfs does odd
things, and actually looks at the dentry that we don't even pass down,
rather than just the name'. Which results in problems, because we
actually call d_compare before we have even verified that the dentry is
still hashed at all.
And that causes a problem since the inode that procfs looks at may have
been free'd and the d_inode pointer is NULL. procfs just assumes that
all dentries are positive, since procfs itself never generates a
negative one. But memory pressure will still result in the dentry
getting torn down, and as it is removed by RCU, it still remains visible
on some lists - and to d_compare.
If the filesystem just did a name comparison, we wouldn't care. And we
could just fix procfs to know about negative dentries too. But rather
than have the low-level filesystems know about internal VFS details,
just move the check for a unhashed dentry up a bit, so that we will only
call d_compare on dentries that are still active.
The actual oops this caused didn't look like a NULL pointer dereference
because procfs did a 'container_of(inode, struct proc_inode, vfs_inode)'
to get at its internal proc_inode information from the inode pointer,
and accessed a field below the inode. So the oops would look something
like
BUG: unable to handle kernel paging request at fffffffffffffff0
IP: [<ffffffff802bc6c6>] proc_sys_compare+0x36/0x50
and was seen on both x86-64 (Alexey Dobriyan and Hugh Dickins) and
ppc64 (Hugh Dickins).
Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-of-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-29 22:42:57 +08:00
|
|
|
found = dentry;
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
break;
|
|
|
|
next:
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
2006-03-31 18:31:43 +08:00
|
|
|
/**
|
|
|
|
* d_hash_and_lookup - hash the qstr then search for a dentry
|
|
|
|
* @dir: Directory to search in
|
|
|
|
* @name: qstr of name we wish to find
|
|
|
|
*
|
|
|
|
* On hash failure or on lookup failure NULL is returned.
|
|
|
|
*/
|
|
|
|
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
|
|
|
|
{
|
|
|
|
struct dentry *dentry = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check for a fs-specific hash function. Note that we must
|
|
|
|
* calculate the standard hash first, as the d_op->d_hash()
|
|
|
|
* routine may choose to leave the hash value unchanged.
|
|
|
|
*/
|
|
|
|
name->hash = full_name_hash(name->name, name->len);
|
2011-01-07 14:49:55 +08:00
|
|
|
if (dir->d_flags & DCACHE_OP_HASH) {
|
2011-01-07 14:49:28 +08:00
|
|
|
if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
|
2006-03-31 18:31:43 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
dentry = d_lookup(dir, name);
|
|
|
|
out:
|
|
|
|
return dentry;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
2011-01-07 14:49:16 +08:00
|
|
|
* d_validate - verify dentry provided from insecure source (deprecated)
|
2005-04-17 06:20:36 +08:00
|
|
|
* @dentry: The dentry alleged to be valid child of @dparent
|
2011-01-23 12:16:06 +08:00
|
|
|
* @dparent: The parent dentry (known to be valid)
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* An insecure source has sent us a dentry, here we verify it and dget() it.
|
|
|
|
* This is used by ncpfs in its readdir implementation.
|
|
|
|
* Zero is returned in the dentry is invalid.
|
2011-01-07 14:49:16 +08:00
|
|
|
*
|
|
|
|
* This function is slow for big directories, and deprecated, do not use it.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-01-05 17:01:21 +08:00
|
|
|
int d_validate(struct dentry *dentry, struct dentry *dparent)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-01-07 14:49:16 +08:00
|
|
|
struct dentry *child;
|
2011-01-05 17:01:21 +08:00
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_lock(&dparent->d_lock);
|
2011-01-07 14:49:16 +08:00
|
|
|
list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
|
|
|
|
if (dentry == child) {
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
2011-01-07 14:49:43 +08:00
|
|
|
__dget_dlock(dentry);
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
spin_unlock(&dparent->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&dparent->d_lock);
|
2011-01-07 14:49:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_validate);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* When a file is deleted, we have two options:
|
|
|
|
* - turn this dentry into a negative dentry
|
|
|
|
* - unhash this dentry and free it.
|
|
|
|
*
|
|
|
|
* Usually, we want to just turn this into
|
|
|
|
* a negative dentry, but if anybody else is
|
|
|
|
* currently using the dentry or the inode
|
|
|
|
* we can't do that and we fall back on removing
|
|
|
|
* it from the hash queues and waiting for
|
|
|
|
* it to be deleted later when it has no users
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* d_delete - delete a dentry
|
|
|
|
* @dentry: The dentry to delete
|
|
|
|
*
|
|
|
|
* Turn the dentry into a negative dentry if possible, otherwise
|
|
|
|
* remove it from the hash queues so it can be deleted later
|
|
|
|
*/
|
|
|
|
|
|
|
|
void d_delete(struct dentry * dentry)
|
|
|
|
{
|
2011-01-07 14:50:06 +08:00
|
|
|
struct inode *inode;
|
2005-08-09 01:52:16 +08:00
|
|
|
int isdir = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Are we the only user?
|
|
|
|
*/
|
2011-01-07 14:49:42 +08:00
|
|
|
again:
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2011-01-07 14:50:06 +08:00
|
|
|
inode = dentry->d_inode;
|
|
|
|
isdir = S_ISDIR(inode->i_mode);
|
2011-01-07 14:49:32 +08:00
|
|
|
if (dentry->d_count == 1) {
|
2011-01-07 14:50:06 +08:00
|
|
|
if (inode && !spin_trylock(&inode->i_lock)) {
|
2011-01-07 14:49:42 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
cpu_relax();
|
|
|
|
goto again;
|
|
|
|
}
|
2010-05-22 04:11:04 +08:00
|
|
|
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
dentry_unlink_inode(dentry);
|
2005-08-09 01:52:16 +08:00
|
|
|
fsnotify_nameremove(dentry, isdir);
|
2005-04-17 06:20:36 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!d_unhashed(dentry))
|
|
|
|
__d_drop(dentry);
|
|
|
|
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-08-09 01:52:16 +08:00
|
|
|
|
|
|
|
fsnotify_nameremove(dentry, isdir);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_delete);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-04-24 13:32:03 +08:00
|
|
|
static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-01-07 14:50:05 +08:00
|
|
|
BUG_ON(!d_unhashed(entry));
|
2011-04-26 02:01:36 +08:00
|
|
|
hlist_bl_lock(b);
|
vfs: get rid of insane dentry hashing rules
The dentry hashing rules have been really quite complicated for a long
while, in odd ways. That made functions like __d_drop() very fragile
and non-obvious.
In particular, whether a dentry was hashed or not was indicated with an
explicit DCACHE_UNHASHED bit. That's despite the fact that the hash
abstraction that the dentries use actually has an 'is this entry hashed
or not' model (which is a simple test of the 'pprev' pointer).
The reason that was done is because we used the normal 'is this entry
unhashed' model to mark whether the dentry had _ever_ been hashed in the
dentry hash tables, and that logic goes back many years (commit
b3423415fbc2: "dcache: avoid RCU for never-hashed dentries").
That, in turn, meant that __d_drop had totally different unhashing logic
for the dentry hash table case and for the anonymous dcache case,
because in order to use the "is this dentry hashed" logic as a flag for
whether it had ever been on the RCU hash table, we had to unhash such a
dentry differently so that we'd never think that it wasn't 'unhashed'
and wouldn't be free'd correctly.
That's just insane. It made the logic really hard to follow, when there
were two different kinds of "unhashed" states, and one of them (the one
that used "list_bl_unhashed()") really had nothing at all to do with
being unhashed per se, but with a very subtle lifetime rule instead.
So turn all of it around, and make it logical.
Instead of having a DCACHE_UNHASHED bit in d_flags to indicate whether
the dentry is on the hash chains or not, use the hash chain unhashed
logic for that. Suddenly "d_unhashed()" just uses "list_bl_unhashed()",
and everything makes sense.
And for the lifetime rule, just use an explicit DCACHE_RCUACCESS bit.
If we ever insert the dentry into the dentry hash table so that it is
visible to RCU lookup, we mark it DCACHE_RCUACCESS to show that it now
needs the RCU lifetime rules. Now suddenly that test at dentry free
time makes sense too.
And because unhashing now is sane and doesn't depend on where the dentry
got unhashed from (because the dentry hash chain details don't have
some subtle side effects), we can re-unify the __d_drop() logic and use
common code for the unhashing.
Also fix one more open-coded hash chain bit_spin_lock() that I missed in
the previous chain locking cleanup commit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-04-24 22:58:46 +08:00
|
|
|
entry->d_flags |= DCACHE_RCUACCESS;
|
2011-04-24 13:32:03 +08:00
|
|
|
hlist_bl_add_head_rcu(&entry->d_hash, b);
|
2011-04-26 02:01:36 +08:00
|
|
|
hlist_bl_unlock(b);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-08-23 08:06:07 +08:00
|
|
|
static void _d_rehash(struct dentry * entry)
|
|
|
|
{
|
|
|
|
__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* d_rehash - add an entry back to the hash
|
|
|
|
* @entry: dentry to add to the hash
|
|
|
|
*
|
|
|
|
* Adds a dentry to the hash according to its name.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void d_rehash(struct dentry * entry)
|
|
|
|
{
|
|
|
|
spin_lock(&entry->d_lock);
|
2006-08-23 08:06:07 +08:00
|
|
|
_d_rehash(entry);
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&entry->d_lock);
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_rehash);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:26 +08:00
|
|
|
/**
|
|
|
|
* dentry_update_name_case - update case insensitive dentry with a new name
|
|
|
|
* @dentry: dentry to be updated
|
|
|
|
* @name: new name
|
|
|
|
*
|
|
|
|
* Update a case insensitive dentry with new case of name.
|
|
|
|
*
|
|
|
|
* dentry must have been returned by d_lookup with name @name. Old and new
|
|
|
|
* name lengths must match (ie. no d_compare which allows mismatched name
|
|
|
|
* lengths).
|
|
|
|
*
|
|
|
|
* Parent inode i_mutex must be held over d_lookup and into this call (to
|
|
|
|
* keep renames and concurrent inserts, and readdir(2) away).
|
|
|
|
*/
|
|
|
|
void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
|
|
|
|
{
|
2011-04-15 22:34:26 +08:00
|
|
|
BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
|
2011-01-07 14:49:26 +08:00
|
|
|
BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
|
|
|
|
|
|
|
|
spin_lock(&dentry->d_lock);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
write_seqcount_begin(&dentry->d_seq);
|
2011-01-07 14:49:26 +08:00
|
|
|
memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
write_seqcount_end(&dentry->d_seq);
|
2011-01-07 14:49:26 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dentry_update_name_case);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void switch_names(struct dentry *dentry, struct dentry *target)
|
|
|
|
{
|
|
|
|
if (dname_external(target)) {
|
|
|
|
if (dname_external(dentry)) {
|
|
|
|
/*
|
|
|
|
* Both external: swap the pointers
|
|
|
|
*/
|
2009-01-08 10:09:14 +08:00
|
|
|
swap(target->d_name.name, dentry->d_name.name);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* dentry:internal, target:external. Steal target's
|
|
|
|
* storage and make target internal.
|
|
|
|
*/
|
2007-10-22 07:41:38 +08:00
|
|
|
memcpy(target->d_iname, dentry->d_name.name,
|
|
|
|
dentry->d_name.len + 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
dentry->d_name.name = target->d_name.name;
|
|
|
|
target->d_name.name = target->d_iname;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (dname_external(dentry)) {
|
|
|
|
/*
|
|
|
|
* dentry:external, target:internal. Give dentry's
|
|
|
|
* storage to target and make dentry internal
|
|
|
|
*/
|
|
|
|
memcpy(dentry->d_iname, target->d_name.name,
|
|
|
|
target->d_name.len + 1);
|
|
|
|
target->d_name.name = dentry->d_name.name;
|
|
|
|
dentry->d_name.name = dentry->d_iname;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Both are internal. Just copy target to dentry
|
|
|
|
*/
|
|
|
|
memcpy(dentry->d_iname, target->d_name.name,
|
|
|
|
target->d_name.len + 1);
|
2008-11-04 04:03:50 +08:00
|
|
|
dentry->d_name.len = target->d_name.len;
|
|
|
|
return;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
2009-01-08 10:09:14 +08:00
|
|
|
swap(dentry->d_name.len, target->d_name.len);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* XXXX: do we really need to take target->d_lock?
|
|
|
|
*/
|
|
|
|
if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
|
|
|
|
spin_lock(&target->d_parent->d_lock);
|
|
|
|
else {
|
|
|
|
if (d_ancestor(dentry->d_parent, target->d_parent)) {
|
|
|
|
spin_lock(&dentry->d_parent->d_lock);
|
|
|
|
spin_lock_nested(&target->d_parent->d_lock,
|
|
|
|
DENTRY_D_LOCK_NESTED);
|
|
|
|
} else {
|
|
|
|
spin_lock(&target->d_parent->d_lock);
|
|
|
|
spin_lock_nested(&dentry->d_parent->d_lock,
|
|
|
|
DENTRY_D_LOCK_NESTED);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (target < dentry) {
|
|
|
|
spin_lock_nested(&target->d_lock, 2);
|
|
|
|
spin_lock_nested(&dentry->d_lock, 3);
|
|
|
|
} else {
|
|
|
|
spin_lock_nested(&dentry->d_lock, 2);
|
|
|
|
spin_lock_nested(&target->d_lock, 3);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void dentry_unlock_parents_for_move(struct dentry *dentry,
|
|
|
|
struct dentry *target)
|
|
|
|
{
|
|
|
|
if (target->d_parent != dentry->d_parent)
|
|
|
|
spin_unlock(&dentry->d_parent->d_lock);
|
|
|
|
if (target->d_parent != target)
|
|
|
|
spin_unlock(&target->d_parent->d_lock);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2011-01-07 14:49:34 +08:00
|
|
|
* When switching names, the actual string doesn't strictly have to
|
|
|
|
* be preserved in the target - because we're dropping the target
|
|
|
|
* anyway. As such, we can just do a simple memcpy() to copy over
|
|
|
|
* the new name before we switch.
|
|
|
|
*
|
|
|
|
* Note that we have to be a lot more careful about getting the hash
|
|
|
|
* switched - we have to switch the hash value properly even if it
|
|
|
|
* then no longer matches the actual (corrupted) string of the target.
|
|
|
|
* The hash value has to match the hash queue that the dentry is on..
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-10-22 01:24:20 +08:00
|
|
|
/*
|
2011-07-13 09:42:24 +08:00
|
|
|
* __d_move - move a dentry
|
2005-04-17 06:20:36 +08:00
|
|
|
* @dentry: entry to move
|
|
|
|
* @target: new dentry
|
|
|
|
*
|
|
|
|
* Update the dcache to reflect the move of a file name. Negative
|
2011-07-27 01:33:16 +08:00
|
|
|
* dcache entries should not be moved in this way. Caller must hold
|
|
|
|
* rename_lock, the i_mutex of the source and target directories,
|
|
|
|
* and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-07-13 09:42:24 +08:00
|
|
|
static void __d_move(struct dentry * dentry, struct dentry * target)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
if (!dentry->d_inode)
|
|
|
|
printk(KERN_WARNING "VFS: moving negative dcache entry\n");
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
BUG_ON(d_ancestor(dentry, target));
|
|
|
|
BUG_ON(d_ancestor(target, dentry));
|
|
|
|
|
|
|
|
dentry_lock_for_move(dentry, target);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
write_seqcount_begin(&dentry->d_seq);
|
|
|
|
write_seqcount_begin(&target->d_seq);
|
|
|
|
|
2011-01-07 14:50:05 +08:00
|
|
|
/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move the dentry to the target hash queue. Don't bother checking
|
|
|
|
* for the same hash queue because of how unlikely it is.
|
|
|
|
*/
|
|
|
|
__d_drop(dentry);
|
2011-01-07 14:49:30 +08:00
|
|
|
__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Unhash the target: dput() will then get rid of it */
|
|
|
|
__d_drop(target);
|
|
|
|
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because it broke the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using a union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
list_del(&dentry->d_u.d_child);
|
|
|
|
list_del(&target->d_u.d_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Switch the names.. */
|
|
|
|
switch_names(dentry, target);
|
2009-01-08 10:09:14 +08:00
|
|
|
swap(dentry->d_name.hash, target->d_name.hash);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* ... and switch the parents */
|
|
|
|
if (IS_ROOT(dentry)) {
|
|
|
|
dentry->d_parent = target->d_parent;
|
|
|
|
target->d_parent = target;
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because it broke the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using a union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
INIT_LIST_HEAD(&target->d_u.d_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
2009-01-08 10:09:14 +08:00
|
|
|
swap(dentry->d_parent, target->d_parent);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* And add them back to the (new) parent lists */
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because it broke the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using a union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because it broke the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using a union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
|
2011-01-07 14:49:34 +08:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
write_seqcount_end(&target->d_seq);
|
|
|
|
write_seqcount_end(&dentry->d_seq);
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
dentry_unlock_parents_for_move(dentry, target);
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&target->d_lock);
|
2006-03-25 19:07:09 +08:00
|
|
|
fsnotify_d_move(dentry);
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2011-07-13 09:42:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* d_move - move a dentry
|
|
|
|
* @dentry: entry to move
|
|
|
|
* @target: new dentry
|
|
|
|
*
|
|
|
|
* Update the dcache to reflect the move of a file name. Negative
|
2011-07-27 01:33:16 +08:00
|
|
|
* dcache entries should not be moved in this way. See the locking
|
|
|
|
* requirements for __d_move.
|
2011-07-13 09:42:24 +08:00
|
|
|
*/
|
|
|
|
void d_move(struct dentry *dentry, struct dentry *target)
|
|
|
|
{
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
__d_move(dentry, target);
|
2005-04-17 06:20:36 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2006-10-22 01:24:20 +08:00
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_move);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-16 06:50:28 +08:00
|
|
|
/**
|
|
|
|
* d_ancestor - search for an ancestor
|
|
|
|
* @p1: ancestor dentry
|
|
|
|
* @p2: child dentry
|
|
|
|
*
|
|
|
|
* Returns the ancestor dentry of p2 which is a child of p1, if p1 is
|
|
|
|
* an ancestor of p2, else NULL.
|
2006-10-22 01:24:20 +08:00
|
|
|
*/
|
2008-10-16 06:50:28 +08:00
|
|
|
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
|
2006-10-22 01:24:20 +08:00
|
|
|
{
|
|
|
|
struct dentry *p;
|
|
|
|
|
2008-10-16 06:50:27 +08:00
|
|
|
for (p = p2; !IS_ROOT(p); p = p->d_parent) {
|
2006-10-22 01:24:20 +08:00
|
|
|
if (p->d_parent == p1)
|
2008-10-16 06:50:28 +08:00
|
|
|
return p;
|
2006-10-22 01:24:20 +08:00
|
|
|
}
|
2008-10-16 06:50:28 +08:00
|
|
|
return NULL;
|
2006-10-22 01:24:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This helper attempts to cope with remotely renamed directories
|
|
|
|
*
|
|
|
|
* It assumes that the caller is already holding
|
2011-07-13 09:42:24 +08:00
|
|
|
* dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock
|
2006-10-22 01:24:20 +08:00
|
|
|
*
|
|
|
|
* Note: If ever the locking in lock_rename() changes, then please
|
|
|
|
* remember to update this too...
|
|
|
|
*/
|
2011-01-07 14:50:06 +08:00
|
|
|
static struct dentry *__d_unalias(struct inode *inode,
|
|
|
|
struct dentry *dentry, struct dentry *alias)
|
2006-10-22 01:24:20 +08:00
|
|
|
{
|
|
|
|
struct mutex *m1 = NULL, *m2 = NULL;
|
|
|
|
struct dentry *ret;
|
|
|
|
|
|
|
|
/* If alias and dentry share a parent, then no extra locks required */
|
|
|
|
if (alias->d_parent == dentry->d_parent)
|
|
|
|
goto out_unalias;
|
|
|
|
|
|
|
|
/* See lock_rename() */
|
|
|
|
ret = ERR_PTR(-EBUSY);
|
|
|
|
if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
|
|
|
|
goto out_err;
|
|
|
|
m1 = &dentry->d_sb->s_vfs_rename_mutex;
|
|
|
|
if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
|
|
|
|
goto out_err;
|
|
|
|
m2 = &alias->d_parent->d_inode->i_mutex;
|
|
|
|
out_unalias:
|
2011-07-13 09:42:24 +08:00
|
|
|
__d_move(alias, dentry);
|
2006-10-22 01:24:20 +08:00
|
|
|
ret = alias;
|
|
|
|
out_err:
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2006-10-22 01:24:20 +08:00
|
|
|
if (m2)
|
|
|
|
mutex_unlock(m2);
|
|
|
|
if (m1)
|
|
|
|
mutex_unlock(m1);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2006-08-23 08:06:07 +08:00
|
|
|
/*
|
|
|
|
* Prepare an anonymous dentry for life in the superblock's dentry tree as a
|
|
|
|
* named dentry in place of the dentry to be replaced.
|
2011-01-07 14:49:34 +08:00
|
|
|
* returns with anon->d_lock held!
|
2006-08-23 08:06:07 +08:00
|
|
|
*/
|
|
|
|
static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
|
|
|
|
{
|
|
|
|
struct dentry *dparent, *aparent;
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
dentry_lock_for_move(anon, dentry);
|
2006-08-23 08:06:07 +08:00
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
write_seqcount_begin(&dentry->d_seq);
|
|
|
|
write_seqcount_begin(&anon->d_seq);
|
|
|
|
|
2006-08-23 08:06:07 +08:00
|
|
|
dparent = dentry->d_parent;
|
|
|
|
aparent = anon->d_parent;
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
switch_names(dentry, anon);
|
|
|
|
swap(dentry->d_name.hash, anon->d_name.hash);
|
|
|
|
|
2006-08-23 08:06:07 +08:00
|
|
|
dentry->d_parent = (aparent == anon) ? dentry : aparent;
|
|
|
|
list_del(&dentry->d_u.d_child);
|
|
|
|
if (!IS_ROOT(dentry))
|
|
|
|
list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
|
|
|
|
else
|
|
|
|
INIT_LIST_HEAD(&dentry->d_u.d_child);
|
|
|
|
|
|
|
|
anon->d_parent = (dparent == dentry) ? anon : dparent;
|
|
|
|
list_del(&anon->d_u.d_child);
|
|
|
|
if (!IS_ROOT(anon))
|
|
|
|
list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs);
|
|
|
|
else
|
|
|
|
INIT_LIST_HEAD(&anon->d_u.d_child);
|
|
|
|
|
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 14:49:52 +08:00
|
|
|
write_seqcount_end(&dentry->d_seq);
|
|
|
|
write_seqcount_end(&anon->d_seq);
|
|
|
|
|
2011-01-07 14:49:34 +08:00
|
|
|
dentry_unlock_parents_for_move(anon, dentry);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
|
|
|
|
/* anon->d_lock still locked, returns locked */
|
2006-08-23 08:06:07 +08:00
|
|
|
anon->d_flags &= ~DCACHE_DISCONNECTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* d_materialise_unique - introduce an inode into the tree
|
|
|
|
* @dentry: candidate dentry
|
|
|
|
* @inode: inode to bind to the dentry, to which aliases may be attached
|
|
|
|
*
|
|
|
|
* Introduces an dentry into the tree, substituting an extant disconnected
|
2011-07-27 01:33:16 +08:00
|
|
|
* root directory alias in its place if there is one. Caller must hold the
|
|
|
|
* i_mutex of the parent directory.
|
2006-08-23 08:06:07 +08:00
|
|
|
*/
|
|
|
|
struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
|
|
|
|
{
|
2006-10-22 01:24:20 +08:00
|
|
|
struct dentry *actual;
|
2006-08-23 08:06:07 +08:00
|
|
|
|
|
|
|
BUG_ON(!d_unhashed(dentry));
|
|
|
|
|
|
|
|
if (!inode) {
|
|
|
|
actual = dentry;
|
2008-10-16 06:50:28 +08:00
|
|
|
__d_instantiate(dentry, NULL);
|
2011-01-07 14:49:42 +08:00
|
|
|
d_rehash(actual);
|
|
|
|
goto out_nolock;
|
2006-08-23 08:06:07 +08:00
|
|
|
}
|
|
|
|
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_lock(&inode->i_lock);
|
2011-01-07 14:49:42 +08:00
|
|
|
|
2006-10-22 01:24:20 +08:00
|
|
|
if (S_ISDIR(inode->i_mode)) {
|
|
|
|
struct dentry *alias;
|
|
|
|
|
|
|
|
/* Does an aliased dentry already exist? */
|
|
|
|
alias = __d_find_alias(inode, 0);
|
|
|
|
if (alias) {
|
|
|
|
actual = alias;
|
2011-07-13 09:42:24 +08:00
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
|
|
|
|
if (d_ancestor(alias, dentry)) {
|
|
|
|
/* Check for loops */
|
|
|
|
actual = ERR_PTR(-ELOOP);
|
|
|
|
} else if (IS_ROOT(alias)) {
|
|
|
|
/* Is this an anonymous mountpoint that we
|
|
|
|
* could splice into our tree? */
|
2006-10-22 01:24:20 +08:00
|
|
|
__d_materialise_dentry(dentry, alias);
|
2011-07-13 09:42:24 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2006-10-22 01:24:20 +08:00
|
|
|
__d_drop(alias);
|
|
|
|
goto found;
|
2011-07-13 09:42:24 +08:00
|
|
|
} else {
|
|
|
|
/* Nope, but we must(!) avoid directory
|
|
|
|
* aliasing */
|
|
|
|
actual = __d_unalias(inode, dentry, alias);
|
2006-10-22 01:24:20 +08:00
|
|
|
}
|
2011-07-13 09:42:24 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2011-08-16 22:31:30 +08:00
|
|
|
if (IS_ERR(actual)) {
|
|
|
|
if (PTR_ERR(actual) == -ELOOP)
|
|
|
|
pr_warn_ratelimited(
|
|
|
|
"VFS: Lookup of '%s' in %s %s"
|
|
|
|
" would have caused loop\n",
|
|
|
|
dentry->d_name.name,
|
|
|
|
inode->i_sb->s_type->name,
|
|
|
|
inode->i_sb->s_id);
|
2006-10-22 01:24:20 +08:00
|
|
|
dput(alias);
|
2011-08-16 22:31:30 +08:00
|
|
|
}
|
2006-10-22 01:24:20 +08:00
|
|
|
goto out_nolock;
|
|
|
|
}
|
2006-08-23 08:06:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Add a unique reference */
|
|
|
|
actual = __d_instantiate_unique(dentry, inode);
|
|
|
|
if (!actual)
|
|
|
|
actual = dentry;
|
2011-01-07 14:49:42 +08:00
|
|
|
else
|
|
|
|
BUG_ON(!d_unhashed(actual));
|
2006-08-23 08:06:07 +08:00
|
|
|
|
|
|
|
spin_lock(&actual->d_lock);
|
|
|
|
found:
|
|
|
|
_d_rehash(actual);
|
|
|
|
spin_unlock(&actual->d_lock);
|
2011-01-07 14:50:06 +08:00
|
|
|
spin_unlock(&inode->i_lock);
|
2006-10-22 01:24:20 +08:00
|
|
|
out_nolock:
|
2006-08-23 08:06:07 +08:00
|
|
|
if (actual == dentry) {
|
|
|
|
security_d_instantiate(dentry, inode);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
iput(inode);
|
|
|
|
return actual;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL_GPL(d_materialise_unique);
|
2006-08-23 08:06:07 +08:00
|
|
|
|
2008-06-24 00:11:53 +08:00
|
|
|
/*
 * Copy @namelen bytes of @str immediately in front of *@buffer.
 *
 * *@buflen is always reduced by @namelen.  If that leaves it negative,
 * nothing is copied, *@buffer is left untouched and -ENAMETOOLONG is
 * returned.  Otherwise *@buffer is moved back by @namelen bytes, the
 * bytes are copied there, and 0 is returned.
 */
static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
	int remaining = *buflen - namelen;

	*buflen = remaining;
	if (remaining < 0)
		return -ENAMETOOLONG;

	*buffer -= namelen;
	memcpy(*buffer, str, namelen);
	return 0;
}
|
|
|
|
|
2008-06-24 00:11:53 +08:00
|
|
|
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
|
|
|
|
{
|
|
|
|
return prepend(buffer, buflen, name->name, name->len);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
2010-11-19 07:02:49 +08:00
|
|
|
* prepend_path - Prepend path string to a buffer
|
2008-03-27 20:06:21 +08:00
|
|
|
* @path: the dentry/vfsmount to report
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
* @root: root vfsmnt/dentry
|
2010-08-10 17:41:39 +08:00
|
|
|
* @buffer: pointer to the end of the buffer
|
|
|
|
* @buflen: pointer to buffer length
|
2007-02-14 04:08:18 +08:00
|
|
|
*
|
2011-01-07 14:49:37 +08:00
|
|
|
* Caller holds the rename_lock.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
static int prepend_path(const struct path *path,
|
|
|
|
const struct path *root,
|
2010-08-10 17:41:39 +08:00
|
|
|
char **buffer, int *buflen)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-03-27 20:06:21 +08:00
|
|
|
struct dentry *dentry = path->dentry;
|
|
|
|
struct vfsmount *vfsmnt = path->mnt;
|
2011-11-25 11:19:58 +08:00
|
|
|
struct mount *mnt = real_mount(vfsmnt);
|
2010-08-10 17:41:39 +08:00
|
|
|
bool slash = false;
|
|
|
|
int error = 0;
|
2008-03-27 20:06:20 +08:00
|
|
|
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:39 +08:00
|
|
|
br_read_lock(vfsmount_lock);
|
2010-08-10 17:41:39 +08:00
|
|
|
while (dentry != root->dentry || vfsmnt != root->mnt) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct dentry * parent;
|
|
|
|
|
|
|
|
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
|
2007-02-14 04:08:18 +08:00
|
|
|
/* Global root? */
|
2011-11-25 10:47:05 +08:00
|
|
|
if (!mnt_has_parent(mnt))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto global_root;
|
2011-11-25 11:25:07 +08:00
|
|
|
dentry = mnt->mnt_mountpoint;
|
2011-11-25 11:19:58 +08:00
|
|
|
mnt = mnt->mnt_parent;
|
|
|
|
vfsmnt = &mnt->mnt;
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
parent = dentry->d_parent;
|
|
|
|
prefetch(parent);
|
2011-01-07 14:49:36 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
2010-08-10 17:41:39 +08:00
|
|
|
error = prepend_name(buffer, buflen, &dentry->d_name);
|
2011-01-07 14:49:36 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2010-08-10 17:41:39 +08:00
|
|
|
if (!error)
|
|
|
|
error = prepend(buffer, buflen, "/", 1);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
|
|
|
|
slash = true;
|
2005-04-17 06:20:36 +08:00
|
|
|
dentry = parent;
|
|
|
|
}
|
|
|
|
|
2010-08-10 17:41:39 +08:00
|
|
|
if (!error && !slash)
|
|
|
|
error = prepend(buffer, buflen, "/", 1);
|
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
out:
|
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock
Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.
A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.
The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).
The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-18 02:37:39 +08:00
|
|
|
br_read_unlock(vfsmount_lock);
|
2010-08-10 17:41:39 +08:00
|
|
|
return error;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
global_root:
|
2010-08-10 17:41:38 +08:00
|
|
|
/*
|
|
|
|
* Filesystems needing to implement special "root names"
|
|
|
|
* should do so with ->d_dname()
|
|
|
|
*/
|
|
|
|
if (IS_ROOT(dentry) &&
|
|
|
|
(dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
|
|
|
|
WARN(1, "Root dentry has weird name <%.*s>\n",
|
|
|
|
(int) dentry->d_name.len, dentry->d_name.name);
|
|
|
|
}
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
if (!slash)
|
|
|
|
error = prepend(buffer, buflen, "/", 1);
|
|
|
|
if (!error)
|
2011-11-25 13:46:35 +08:00
|
|
|
error = real_mount(vfsmnt)->mnt_ns ? 1 : 2;
|
2008-06-16 19:28:07 +08:00
|
|
|
goto out;
|
2010-08-10 17:41:39 +08:00
|
|
|
}
|
2008-06-16 19:28:07 +08:00
|
|
|
|
2010-08-10 17:41:39 +08:00
|
|
|
/**
|
|
|
|
* __d_path - return the path of a dentry
|
|
|
|
* @path: the dentry/vfsmount to report
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
* @root: root vfsmnt/dentry
|
2010-08-15 04:05:31 +08:00
|
|
|
* @buf: buffer to return value in
|
2010-08-10 17:41:39 +08:00
|
|
|
* @buflen: buffer length
|
|
|
|
*
|
2010-08-10 17:41:40 +08:00
|
|
|
* Convert a dentry into an ASCII path name.
|
2010-08-10 17:41:39 +08:00
|
|
|
*
|
|
|
|
* Returns a pointer into the buffer or an error code if the
|
|
|
|
* path was too long.
|
|
|
|
*
|
2010-10-10 17:36:21 +08:00
|
|
|
* "buflen" should be positive.
|
2010-08-10 17:41:39 +08:00
|
|
|
*
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
* If the path is not reachable from the supplied root, return %NULL.
|
2010-08-10 17:41:39 +08:00
|
|
|
*/
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
char *__d_path(const struct path *path,
|
|
|
|
const struct path *root,
|
2010-08-10 17:41:39 +08:00
|
|
|
char *buf, int buflen)
|
|
|
|
{
|
|
|
|
char *res = buf + buflen;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
prepend(&res, &buflen, "\0", 1);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_seqlock(&rename_lock);
|
2010-08-10 17:41:39 +08:00
|
|
|
error = prepend_path(path, root, &res, &buflen);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2010-10-10 17:36:21 +08:00
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
if (error < 0)
|
|
|
|
return ERR_PTR(error);
|
|
|
|
if (error > 0)
|
|
|
|
return NULL;
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
char *d_absolute_path(const struct path *path,
|
|
|
|
char *buf, int buflen)
|
|
|
|
{
|
|
|
|
struct path root = {};
|
|
|
|
char *res = buf + buflen;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
prepend(&res, &buflen, "\0", 1);
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
error = prepend_path(path, &root, &res, &buflen);
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
|
|
|
|
if (error > 1)
|
|
|
|
error = -EINVAL;
|
|
|
|
if (error < 0)
|
2010-08-10 17:41:39 +08:00
|
|
|
return ERR_PTR(error);
|
|
|
|
return res;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-08-10 17:41:40 +08:00
|
|
|
/*
|
|
|
|
* same as __d_path but appends "(deleted)" for unlinked files.
|
|
|
|
*/
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
static int path_with_deleted(const struct path *path,
|
|
|
|
const struct path *root,
|
|
|
|
char **buf, int *buflen)
|
2010-08-10 17:41:40 +08:00
|
|
|
{
|
|
|
|
prepend(buf, buflen, "\0", 1);
|
|
|
|
if (d_unlinked(path->dentry)) {
|
|
|
|
int error = prepend(buf, buflen, " (deleted)", 10);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
return prepend_path(path, root, buf, buflen);
|
|
|
|
}
|
|
|
|
|
2010-08-10 17:41:41 +08:00
|
|
|
/*
 * Put the "(unreachable)" marker in front of whatever has already been
 * assembled in @buffer.  Returns the result of prepend() unchanged.
 */
static int prepend_unreachable(char **buffer, int *buflen)
{
	static const char marker[] = "(unreachable)";

	/* sizeof - 1: the marker is prepended without its NUL */
	return prepend(buffer, buflen, marker, sizeof(marker) - 1);
}
|
|
|
|
|
2008-02-15 11:38:32 +08:00
|
|
|
/**
|
|
|
|
* d_path - return the path of a dentry
|
2008-02-15 11:38:44 +08:00
|
|
|
* @path: path to report
|
2008-02-15 11:38:32 +08:00
|
|
|
* @buf: buffer to return value in
|
|
|
|
* @buflen: buffer length
|
|
|
|
*
|
|
|
|
* Convert a dentry into an ASCII path name. If the entry has been deleted
|
|
|
|
* the string " (deleted)" is appended. Note that this is ambiguous.
|
|
|
|
*
|
2008-12-02 06:35:00 +08:00
|
|
|
* Returns a pointer into the buffer or an error code if the path was
|
|
|
|
* too long. Note: Callers should use the returned pointer, not the passed
|
|
|
|
* in buffer, to use the name! The implementation often starts at an offset
|
|
|
|
* into the buffer, and may leave 0 bytes at the start.
|
2008-02-15 11:38:32 +08:00
|
|
|
*
|
2008-06-24 00:11:52 +08:00
|
|
|
* "buflen" should be positive.
|
2008-02-15 11:38:32 +08:00
|
|
|
*/
|
2008-06-10 07:40:36 +08:00
|
|
|
char *d_path(const struct path *path, char *buf, int buflen)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-08-10 17:41:40 +08:00
|
|
|
char *res = buf + buflen;
|
2008-02-15 11:34:38 +08:00
|
|
|
struct path root;
|
2010-08-10 17:41:40 +08:00
|
|
|
int error;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-05-08 15:26:18 +08:00
|
|
|
/*
|
|
|
|
* We have various synthetic filesystems that never get mounted. On
|
|
|
|
* these filesystems dentries are never used for lookup purposes, and
|
|
|
|
* thus don't need to be hashed. They also don't need a name until a
|
|
|
|
* user wants to identify the object in /proc/pid/fd/. The little hack
|
|
|
|
* below allows us to generate a name for these objects on demand:
|
|
|
|
*/
|
2008-02-15 11:38:44 +08:00
|
|
|
if (path->dentry->d_op && path->dentry->d_op->d_dname)
|
|
|
|
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
|
2007-05-08 15:26:18 +08:00
|
|
|
|
2010-08-10 17:41:36 +08:00
|
|
|
get_fs_root(current->fs, &root);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_seqlock(&rename_lock);
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
error = path_with_deleted(path, &root, &res, &buflen);
|
|
|
|
if (error < 0)
|
2010-08-10 17:41:40 +08:00
|
|
|
res = ERR_PTR(error);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2008-02-15 11:34:38 +08:00
|
|
|
path_put(&root);
|
2005-04-17 06:20:36 +08:00
|
|
|
return res;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(d_path);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-08-10 17:41:41 +08:00
|
|
|
/**
|
|
|
|
* d_path_with_unreachable - return the path of a dentry
|
|
|
|
* @path: path to report
|
|
|
|
* @buf: buffer to return value in
|
|
|
|
* @buflen: buffer length
|
|
|
|
*
|
|
|
|
* The difference from d_path() is that this prepends "(unreachable)"
|
|
|
|
* to paths which are unreachable from the current process' root.
|
|
|
|
*/
|
|
|
|
char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
|
|
|
|
{
|
|
|
|
char *res = buf + buflen;
|
|
|
|
struct path root;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (path->dentry->d_op && path->dentry->d_op->d_dname)
|
|
|
|
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
|
|
|
|
|
|
|
|
get_fs_root(current->fs, &root);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_seqlock(&rename_lock);
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_root().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it we it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
error = path_with_deleted(path, &root, &res, &buflen);
|
|
|
|
if (error > 0)
|
2010-08-10 17:41:41 +08:00
|
|
|
error = prepend_unreachable(&res, &buflen);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2010-08-10 17:41:41 +08:00
|
|
|
path_put(&root);
|
|
|
|
if (error)
|
|
|
|
res = ERR_PTR(error);
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2007-05-08 15:26:18 +08:00
|
|
|
/*
 * Helper function for dentry_operations.d_dname() members.
 *
 * Formats a name printf-style and copies it, terminating NUL included,
 * to the very end of @buffer (which is @buflen bytes long).
 *
 * Returns a pointer to the start of the name inside @buffer, or
 * ERR_PTR(-ENAMETOOLONG) if the formatted name plus its NUL does not fit
 * in the 64-byte scratch area or in @buflen bytes.
 */
char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
			const char *fmt, ...)
{
	char scratch[64];
	va_list ap;
	int needed;

	va_start(ap, fmt);
	/* + 1 accounts for the terminating NUL */
	needed = vsnprintf(scratch, sizeof(scratch), fmt, ap) + 1;
	va_end(ap);

	if (needed > sizeof(scratch) || needed > buflen)
		return ERR_PTR(-ENAMETOOLONG);

	/* right-align the name so that it ends exactly at buffer + buflen */
	return memcpy(buffer + (buflen - needed), scratch, needed);
}
|
|
|
|
|
2008-03-27 20:06:20 +08:00
|
|
|
/*
|
|
|
|
* Write full pathname from the root of the filesystem into the buffer.
|
|
|
|
*/
|
2011-01-07 14:49:29 +08:00
|
|
|
static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
|
2008-03-27 20:06:20 +08:00
|
|
|
{
|
|
|
|
char *end = buf + buflen;
|
|
|
|
char *retval;
|
|
|
|
|
|
|
|
prepend(&end, &buflen, "\0", 1);
|
|
|
|
if (buflen < 1)
|
|
|
|
goto Elong;
|
|
|
|
/* Get '/' right */
|
|
|
|
retval = end-1;
|
|
|
|
*retval = '/';
|
|
|
|
|
2008-06-24 00:11:53 +08:00
|
|
|
while (!IS_ROOT(dentry)) {
|
|
|
|
struct dentry *parent = dentry->d_parent;
|
2011-01-07 14:49:36 +08:00
|
|
|
int error;
|
2008-03-27 20:06:20 +08:00
|
|
|
|
|
|
|
prefetch(parent);
|
2011-01-07 14:49:36 +08:00
|
|
|
spin_lock(&dentry->d_lock);
|
|
|
|
error = prepend_name(&end, &buflen, &dentry->d_name);
|
|
|
|
spin_unlock(&dentry->d_lock);
|
|
|
|
if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
|
2008-03-27 20:06:20 +08:00
|
|
|
goto Elong;
|
|
|
|
|
|
|
|
retval = end;
|
|
|
|
dentry = parent;
|
|
|
|
}
|
2010-06-07 10:31:14 +08:00
|
|
|
return retval;
|
|
|
|
Elong:
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
|
|
|
}
|
2011-01-07 14:49:29 +08:00
|
|
|
|
|
|
|
char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
|
|
|
|
{
|
|
|
|
char *retval;
|
|
|
|
|
2011-01-07 14:49:37 +08:00
|
|
|
write_seqlock(&rename_lock);
|
2011-01-07 14:49:29 +08:00
|
|
|
retval = __dentry_path(dentry, buf, buflen);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2011-01-07 14:49:29 +08:00
|
|
|
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dentry_path_raw);
|
2010-06-07 10:31:14 +08:00
|
|
|
|
|
|
|
char *dentry_path(struct dentry *dentry, char *buf, int buflen)
|
|
|
|
{
|
|
|
|
char *p = NULL;
|
|
|
|
char *retval;
|
|
|
|
|
2011-01-07 14:49:37 +08:00
|
|
|
write_seqlock(&rename_lock);
|
2010-06-07 10:31:14 +08:00
|
|
|
if (d_unlinked(dentry)) {
|
|
|
|
p = buf + buflen;
|
|
|
|
if (prepend(&p, &buflen, "//deleted", 10) != 0)
|
|
|
|
goto Elong;
|
|
|
|
buflen++;
|
|
|
|
}
|
|
|
|
retval = __dentry_path(dentry, buf, buflen);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2010-06-07 10:31:14 +08:00
|
|
|
if (!IS_ERR(retval) && p)
|
|
|
|
*p = '/'; /* restore '/' overriden with '\0' */
|
2008-03-27 20:06:20 +08:00
|
|
|
return retval;
|
|
|
|
Elong:
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* NOTE! The user-level library version returns a
|
|
|
|
* character pointer. The kernel system call just
|
|
|
|
* returns the length of the buffer filled (which
|
|
|
|
* includes the ending '\0' character), or a negative
|
|
|
|
* error value. So libc would do something like
|
|
|
|
*
|
|
|
|
* char *getcwd(char * buf, size_t size)
|
|
|
|
* {
|
|
|
|
* int retval;
|
|
|
|
*
|
|
|
|
* retval = sys_getcwd(buf, size);
|
|
|
|
* if (retval >= 0)
|
|
|
|
* return buf;
|
|
|
|
* errno = -retval;
|
|
|
|
* return NULL;
|
|
|
|
* }
|
|
|
|
*/
|
2009-01-14 21:14:22 +08:00
|
|
|
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-02-14 04:08:18 +08:00
|
|
|
int error;
|
2008-02-15 11:34:38 +08:00
|
|
|
struct path pwd, root;
|
2007-02-14 04:08:18 +08:00
|
|
|
char *page = (char *) __get_free_page(GFP_USER);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2010-08-10 17:41:36 +08:00
|
|
|
get_fs_root_and_pwd(current->fs, &root, &pwd);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-14 04:08:18 +08:00
|
|
|
error = -ENOENT;
|
2011-01-07 14:49:37 +08:00
|
|
|
write_seqlock(&rename_lock);
|
2009-05-04 07:32:03 +08:00
|
|
|
if (!d_unlinked(pwd.dentry)) {
|
2007-02-14 04:08:18 +08:00
|
|
|
unsigned long len;
|
2010-08-10 17:41:41 +08:00
|
|
|
char *cwd = page + PAGE_SIZE;
|
|
|
|
int buflen = PAGE_SIZE;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-08-10 17:41:41 +08:00
|
|
|
prepend(&cwd, &buflen, "\0", 1);
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
error = prepend_path(&pwd, &root, &cwd, &buflen);
|
2011-01-07 14:49:37 +08:00
|
|
|
write_sequnlock(&rename_lock);
|
2007-02-14 04:08:18 +08:00
|
|
|
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
if (error < 0)
|
2007-02-14 04:08:18 +08:00
|
|
|
goto out;
|
|
|
|
|
2010-08-10 17:41:41 +08:00
|
|
|
/* Unreachable from current root */
|
fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API
__d_path() API is asking for trouble and in case of apparmor d_namespace_path()
getting just that. The root cause is that when __d_path() misses the root
it had been told to look for, it stores the location of the most remote ancestor
in *root. Without grabbing references. Sure, at the moment of call it had
been pinned down by what we have in *path. And if we raced with umount -l, we
could have very well stopped at vfsmount/dentry that got freed as soon as
prepend_path() dropped vfsmount_lock.
It is safe to compare these pointers with pre-existing (and known to be still
alive) vfsmount and dentry, as long as all we are asking is "is it the same
address?". Dereferencing is not safe and apparmor ended up stepping into
that. d_namespace_path() really wants to examine the place where we stopped,
even if it's not connected to our namespace. As the result, it looked
at ->d_sb->s_magic of a dentry that might've been already freed by that point.
All other callers had been careful enough to avoid that, but it's really
a bad interface - it invites that kind of trouble.
The fix is fairly straightforward, even though it's bigger than I'd like:
* prepend_path() root argument becomes const.
* __d_path() is never called with NULL/NULL root. It was a kludge
to start with. Instead, we have an explicit function - d_absolute_path().
Same as __d_path(), except that it doesn't get root passed and stops where
it stops. apparmor and tomoyo are using it.
* __d_path() returns NULL on path outside of root. The main
caller is show_mountinfo() and that's precisely what we pass root for - to
skip those outside chroot jail. Those who don't want that can (and do)
use d_path().
* __d_path() root argument becomes const. Everyone agrees, I hope.
* apparmor does *NOT* try to use __d_path() or any of its variants
when it sees that path->mnt is an internal vfsmount. In that case it's
definitely not mounted anywhere and dentry_path() is exactly what we want
there. Handling of sysctl()-triggered weirdness is moved to that place.
* if apparmor is asked to do pathname relative to chroot jail
and __d_path() tells it it's not in that jail, the sucker just calls
d_absolute_path() instead. That's the other remaining caller of __d_path(),
BTW.
* seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway -
the normal seq_file logics will take care of growing the buffer and redoing
the call of ->show() just fine). However, if it gets path not reachable
from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped
ignoring the return value as it used to do).
Reviewed-by: John Johansen <john.johansen@canonical.com>
ACKed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
2011-12-05 21:43:34 +08:00
|
|
|
if (error > 0) {
|
2010-08-10 17:41:41 +08:00
|
|
|
error = prepend_unreachable(&cwd, &buflen);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-02-14 04:08:18 +08:00
|
|
|
error = -ERANGE;
|
|
|
|
len = PAGE_SIZE + page - cwd;
|
|
|
|
if (len <= size) {
|
|
|
|
error = len;
|
|
|
|
if (copy_to_user(buf, cwd, len))
|
|
|
|
error = -EFAULT;
|
|
|
|
}
|
2011-01-07 14:49:37 +08:00
|
|
|
} else {
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
out:
|
2008-02-15 11:34:38 +08:00
|
|
|
path_put(&pwd);
|
|
|
|
path_put(&root);
|
2005-04-17 06:20:36 +08:00
|
|
|
free_page((unsigned long) page);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Test whether new_dentry is a subdirectory of old_dentry.
|
|
|
|
*
|
|
|
|
* Trivially implemented using the dcache structure
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* is_subdir - is new dentry a subdirectory of old_dentry
|
|
|
|
* @new_dentry: new dentry
|
|
|
|
* @old_dentry: old dentry
|
|
|
|
*
|
|
|
|
* Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
|
|
|
|
* Returns 0 otherwise.
|
|
|
|
* Caller must ensure that "new_dentry" is pinned before calling is_subdir()
|
|
|
|
*/
|
|
|
|
|
2008-10-16 06:50:28 +08:00
|
|
|
int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int result;
|
2011-01-07 14:49:37 +08:00
|
|
|
unsigned seq;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-16 06:50:28 +08:00
|
|
|
if (new_dentry == old_dentry)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
do {
|
2005-04-17 06:20:36 +08:00
|
|
|
/* for restarting inner loop in case of seq retry */
|
|
|
|
seq = read_seqbegin(&rename_lock);
|
2011-01-07 14:49:37 +08:00
|
|
|
/*
|
|
|
|
* Need rcu_readlock to protect against the d_parent trashing
|
|
|
|
* due to d_move
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2008-10-16 06:50:28 +08:00
|
|
|
if (d_ancestor(old_dentry, new_dentry))
|
2005-04-17 06:20:36 +08:00
|
|
|
result = 1;
|
2008-10-16 06:50:28 +08:00
|
|
|
else
|
|
|
|
result = 0;
|
2011-01-07 14:49:37 +08:00
|
|
|
rcu_read_unlock();
|
2005-04-17 06:20:36 +08:00
|
|
|
} while (read_seqretry(&rename_lock, seq));
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
void d_genocide(struct dentry *root)
|
|
|
|
{
|
2011-01-07 14:49:37 +08:00
|
|
|
struct dentry *this_parent;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct list_head *next;
|
2011-01-07 14:49:37 +08:00
|
|
|
unsigned seq;
|
2011-01-07 14:49:39 +08:00
|
|
|
int locked = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-07 14:49:37 +08:00
|
|
|
seq = read_seqbegin(&rename_lock);
|
2011-01-07 14:49:39 +08:00
|
|
|
again:
|
|
|
|
this_parent = root;
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_lock(&this_parent->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
repeat:
|
|
|
|
next = this_parent->d_subdirs.next;
|
|
|
|
resume:
|
|
|
|
while (next != &this_parent->d_subdirs) {
|
|
|
|
struct list_head *tmp = next;
|
[PATCH] shrink dentry struct
Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.
Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because breaking the above tuning
(128 + 8 = 136 bytes)
This patch reverts this unwanted side effect, by using an union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.
At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.
Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)
As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:03:32 +08:00
|
|
|
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
|
2005-04-17 06:20:36 +08:00
|
|
|
next = tmp->next;
|
2011-01-07 14:49:37 +08:00
|
|
|
|
2011-01-07 14:49:33 +08:00
|
|
|
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
|
|
|
if (d_unhashed(dentry) || !dentry->d_inode) {
|
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
2011-01-07 14:49:33 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!list_empty(&dentry->d_subdirs)) {
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
|
|
|
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
|
2005-04-17 06:20:36 +08:00
|
|
|
this_parent = dentry;
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto repeat;
|
|
|
|
}
|
2011-01-07 14:49:37 +08:00
|
|
|
if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
|
|
|
|
dentry->d_flags |= DCACHE_GENOCIDE;
|
|
|
|
dentry->d_count--;
|
|
|
|
}
|
2011-01-07 14:49:32 +08:00
|
|
|
spin_unlock(&dentry->d_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
if (this_parent != root) {
|
2011-03-16 06:29:21 +08:00
|
|
|
struct dentry *child = this_parent;
|
2011-01-07 14:49:37 +08:00
|
|
|
if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
|
|
|
|
this_parent->d_flags |= DCACHE_GENOCIDE;
|
|
|
|
this_parent->d_count--;
|
|
|
|
}
|
2011-03-16 06:29:21 +08:00
|
|
|
this_parent = try_to_ascend(this_parent, locked, seq);
|
|
|
|
if (!this_parent)
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
|
|
|
next = child->d_u.d_child.next;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto resume;
|
|
|
|
}
|
2011-01-07 14:49:34 +08:00
|
|
|
spin_unlock(&this_parent->d_lock);
|
2011-01-07 14:49:39 +08:00
|
|
|
if (!locked && read_seqretry(&rename_lock, seq))
|
2011-01-07 14:49:37 +08:00
|
|
|
goto rename_retry;
|
2011-01-07 14:49:39 +08:00
|
|
|
if (locked)
|
|
|
|
write_sequnlock(&rename_lock);
|
|
|
|
return;
|
|
|
|
|
|
|
|
rename_retry:
|
|
|
|
locked = 1;
|
|
|
|
write_seqlock(&rename_lock);
|
|
|
|
goto again;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* find_inode_number - check for dentry with name
|
|
|
|
* @dir: directory to check
|
|
|
|
* @name: Name to find.
|
|
|
|
*
|
|
|
|
* Check whether a dentry already exists for the given name,
|
|
|
|
* and return the inode number if it has an inode. Otherwise
|
|
|
|
* 0 is returned.
|
|
|
|
*
|
|
|
|
* This routine is used to post-process directory listings for
|
|
|
|
* filesystems using synthetic inode numbers, and is necessary
|
|
|
|
* to keep getcwd() working.
|
|
|
|
*/
|
|
|
|
|
|
|
|
ino_t find_inode_number(struct dentry *dir, struct qstr *name)
|
|
|
|
{
|
|
|
|
struct dentry * dentry;
|
|
|
|
ino_t ino = 0;
|
|
|
|
|
2006-03-31 18:31:43 +08:00
|
|
|
dentry = d_hash_and_lookup(dir, name);
|
|
|
|
if (dentry) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (dentry->d_inode)
|
|
|
|
ino = dentry->d_inode->i_ino;
|
|
|
|
dput(dentry);
|
|
|
|
}
|
|
|
|
return ino;
|
|
|
|
}
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(find_inode_number);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static __initdata unsigned long dhash_entries;
|
|
|
|
static int __init set_dhash_entries(char *str)
|
|
|
|
{
|
|
|
|
if (!str)
|
|
|
|
return 0;
|
|
|
|
dhash_entries = simple_strtoul(str, &str, 0);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("dhash_entries=", set_dhash_entries);
|
|
|
|
|
|
|
|
/*
 * Early-boot setup of the dentry hash table.  Does nothing when hashdist
 * is set; in that case dcache_init() performs the allocation later.
 */
static void __init dcache_init_early(void)
{
	unsigned int bucket, nr_buckets;

	/* If hashes are distributed across NUMA nodes, defer
	 * hash allocation until vmalloc space is available.
	 */
	if (hashdist)
		return;

	dentry_hashtable = alloc_large_system_hash("Dentry cache",
					sizeof(struct hlist_bl_head),
					dhash_entries,
					13,
					HASH_EARLY,
					&d_hash_shift,
					&d_hash_mask,
					0);

	/* d_hash_shift was filled in by the allocation above. */
	nr_buckets = 1U << d_hash_shift;
	for (bucket = 0; bucket < nr_buckets; bucket++)
		INIT_HLIST_BL_HEAD(&dentry_hashtable[bucket]);
}
|
|
|
|
|
2007-10-17 14:26:30 +08:00
|
|
|
/*
 * Create the dentry slab cache and, when hashdist is set, allocate and
 * initialise the dentry hash table that dcache_init_early() skipped.
 */
static void __init dcache_init(void)
{
	unsigned int idx;

	/*
	 * A constructor could be added for stable state like the lists,
	 * but it is probably not worth it because of the cache nature
	 * of the dcache.
	 */
	dentry_cache = KMEM_CACHE(dentry,
		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);

	/* Hash may have been set up in dcache_init_early */
	if (hashdist) {
		dentry_hashtable =
			alloc_large_system_hash("Dentry cache",
						sizeof(struct hlist_bl_head),
						dhash_entries,
						13,
						0,
						&d_hash_shift,
						&d_hash_mask,
						0);

		idx = 0;
		while (idx < (1U << d_hash_shift)) {
			INIT_HLIST_BL_HEAD(dentry_hashtable + idx);
			idx++;
		}
	}
}
|
|
|
|
|
|
|
|
/*
 * SLAB cache for __getname() consumers.  The cache itself is created in
 * vfs_caches_init() as "names_cache" with PATH_MAX-sized objects.
 */
|
2006-12-07 12:33:20 +08:00
|
|
|
struct kmem_cache *names_cachep __read_mostly;
|
2010-01-06 04:45:18 +08:00
|
|
|
EXPORT_SYMBOL(names_cachep);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
|
|
/* Export for d_genocide(), which is defined earlier in this file. */
EXPORT_SYMBOL(d_genocide);
|
|
|
|
|
|
|
|
/*
 * Early VFS cache setup: runs dcache_init_early() and then
 * inode_init_early(), in that order.  dcache_init_early() allocates the
 * dentry hash table unless hashdist is set; inode_init_early() is defined
 * elsewhere (presumably the inode-hash counterpart - confirm).
 */
void __init vfs_caches_init_early(void)
|
|
|
|
{
|
|
|
|
dcache_init_early();
|
|
|
|
inode_init_early();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * vfs_caches_init - set up the VFS caches
 * @mempages: page count used to size the caches
 *
 * Takes a reserve off @mempages, creates the "names_cache" slab cache with
 * PATH_MAX-sized objects, and then runs the dcache, inode, files, mount,
 * block-device and character-device initialisers in that order.  Only
 * files_init() is passed the reduced page count.
 */
void __init vfs_caches_init(unsigned long mempages)
{
	unsigned long in_use;
	unsigned long reserve;

	/* Base hash sizes on available memory, with a reserve equal to
	   150% of current kernel size */
	in_use = mempages - nr_free_pages();
	/* The reserve is capped so that at least one page stays counted. */
	reserve = min(in_use * 3/2, mempages - 1);
	mempages -= reserve;

	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
					 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	dcache_init();
	inode_init();
	files_init(mempages);
	mnt_init();
	bdev_cache_init();
	chrdev_init();
}
|