Merge branch 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

Pull percpu updates from Tejun Heo:

 - Nick improved generic implementations of percpu operations which
   modify the variable and return so that they calculate the physical
   address only once.

 - percpu_ref percpu <-> atomic mode switching improvements. The
   patchset was originally posted about a year ago but fell through the
   crack.

 - misc non-critical fixes.

* 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu:
  mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
  mm/percpu.c: correct max_distance calculation for pcpu_embed_first_chunk()
  percpu: eliminate two sparse warnings
  percpu: improve generic percpu modify-return implementation
  percpu-refcount: init ->confirm_switch member properly
  percpu_ref: allow operation mode switching operations to be called concurrently
  percpu_ref: restructure operation mode switching
  percpu_ref: unify staggered atomic switching wait behavior
  percpu_ref: reorganize __percpu_ref_switch_to_atomic() and relocate percpu_ref_switch_to_atomic()
  percpu_ref: remove unnecessary RCU grace period for staggered atomic switching confirmation
This commit is contained in:
Linus Torvalds 2016-10-14 11:46:25 -07:00
commit b6daa51b9a
4 changed files with 151 additions and 120 deletions

View File

@ -521,7 +521,8 @@ do { \
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
const unsigned long __percpu *addr)
{
unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
unsigned long __percpu *a =
(unsigned long __percpu *)addr + nr / BITS_PER_LONG;
#ifdef CONFIG_X86_64
return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
asm volatile("bt "__percpu_arg(2)",%1\n\t"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
return oldbit;
}

View File

@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
#define PER_CPU_DEF_ATTRIBUTES
#endif
#define raw_cpu_generic_read(pcp) \
({ \
*raw_cpu_ptr(&(pcp)); \
})
#define raw_cpu_generic_to_op(pcp, val, op) \
do { \
*raw_cpu_ptr(&(pcp)) op val; \
@ -72,34 +77,39 @@ do { \
#define raw_cpu_generic_add_return(pcp, val) \
({ \
raw_cpu_add(pcp, val); \
raw_cpu_read(pcp); \
typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \
\
*__p += val; \
*__p; \
})
#define raw_cpu_generic_xchg(pcp, nval) \
({ \
typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \
typeof(pcp) __ret; \
__ret = raw_cpu_read(pcp); \
raw_cpu_write(pcp, nval); \
__ret = *__p; \
*__p = nval; \
__ret; \
})
#define raw_cpu_generic_cmpxchg(pcp, oval, nval) \
({ \
typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \
typeof(pcp) __ret; \
__ret = raw_cpu_read(pcp); \
__ret = *__p; \
if (__ret == (oval)) \
raw_cpu_write(pcp, nval); \
*__p = nval; \
__ret; \
})
#define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
({ \
typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1)); \
typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2)); \
int __ret = 0; \
if (raw_cpu_read(pcp1) == (oval1) && \
raw_cpu_read(pcp2) == (oval2)) { \
raw_cpu_write(pcp1, nval1); \
raw_cpu_write(pcp2, nval2); \
if (*__p1 == (oval1) && *__p2 == (oval2)) { \
*__p1 = nval1; \
*__p2 = nval2; \
__ret = 1; \
} \
(__ret); \
@ -109,7 +119,7 @@ do { \
({ \
typeof(pcp) __ret; \
preempt_disable(); \
__ret = *this_cpu_ptr(&(pcp)); \
__ret = raw_cpu_generic_read(pcp); \
preempt_enable(); \
__ret; \
})
@ -118,17 +128,17 @@ do { \
do { \
unsigned long __flags; \
raw_local_irq_save(__flags); \
*raw_cpu_ptr(&(pcp)) op val; \
raw_cpu_generic_to_op(pcp, val, op); \
raw_local_irq_restore(__flags); \
} while (0)
#define this_cpu_generic_add_return(pcp, val) \
({ \
typeof(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
raw_cpu_add(pcp, val); \
__ret = raw_cpu_read(pcp); \
__ret = raw_cpu_generic_add_return(pcp, val); \
raw_local_irq_restore(__flags); \
__ret; \
})
@ -138,8 +148,7 @@ do { \
typeof(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
__ret = raw_cpu_read(pcp); \
raw_cpu_write(pcp, nval); \
__ret = raw_cpu_generic_xchg(pcp, nval); \
raw_local_irq_restore(__flags); \
__ret; \
})
@ -149,9 +158,7 @@ do { \
typeof(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
__ret = raw_cpu_read(pcp); \
if (__ret == (oval)) \
raw_cpu_write(pcp, nval); \
__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \
raw_local_irq_restore(__flags); \
__ret; \
})
@ -168,16 +175,16 @@ do { \
})
#ifndef raw_cpu_read_1
#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_2
#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_4
#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_read_8
#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp)))
#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp)
#endif
#ifndef raw_cpu_write_1

View File

@ -33,6 +33,7 @@
#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1))
static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
atomic_long_set(&ref->count, start_count);
ref->release = release;
ref->confirm_switch = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref)
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
if (percpu_count) {
/* non-NULL confirm_switch indicates switching in progress */
WARN_ON_ONCE(ref->confirm_switch);
free_percpu(percpu_count);
ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
}
@ -161,34 +165,67 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
/* switching from percpu to atomic */
ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
/*
* Non-NULL ->confirm_switch is used to indicate that
* switching is in progress. Use noop one if unspecified.
*/
WARN_ON_ONCE(ref->confirm_switch);
ref->confirm_switch =
confirm_switch ?: percpu_ref_noop_confirm_switch;
percpu_ref_get(ref); /* put after confirmation */
call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
} else if (confirm_switch) {
/*
* Somebody already set ATOMIC. Switching may still be in
* progress. @confirm_switch must be invoked after the
* switching is complete and a full sched RCU grace period
* has passed. Wait synchronously for the previous
* switching and schedule @confirm_switch invocation.
*/
wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
ref->confirm_switch = confirm_switch;
percpu_ref_get(ref); /* put after confirmation */
call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
if (confirm_switch)
confirm_switch(ref);
return;
}
/* switching from percpu to atomic */
ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
/*
* Non-NULL ->confirm_switch is used to indicate that switching is
* in progress. Use noop one if unspecified.
*/
ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;
percpu_ref_get(ref); /* put after confirmation */
call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
}
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
int cpu;
BUG_ON(!percpu_count);
if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
return;
atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
/*
* Restore per-cpu operation. smp_store_release() is paired with
* smp_read_barrier_depends() in __ref_is_percpu() and guarantees
* that the zeroing is visible to all percpu accesses which can see
* the following __PERCPU_REF_ATOMIC clearing.
*/
for_each_possible_cpu(cpu)
*per_cpu_ptr(percpu_count, cpu) = 0;
smp_store_release(&ref->percpu_count_ptr,
ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}
static void __percpu_ref_switch_mode(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
lockdep_assert_held(&percpu_ref_switch_lock);
/*
* If the previous ATOMIC switching hasn't finished yet, wait for
* its completion. If the caller ensures that ATOMIC switching
* isn't in progress, this function can be called from any context.
*/
wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
percpu_ref_switch_lock);
if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
__percpu_ref_switch_to_atomic(ref, confirm_switch);
else
__percpu_ref_switch_to_percpu(ref);
}
/**
@ -207,47 +244,21 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
* operations. Note that @ref will stay in atomic mode across kill/reinit
* cycles until percpu_ref_switch_to_percpu() is called.
*
* This function normally doesn't block and can be called from any context
* but it may block if @confirm_kill is specified and @ref is already in
* the process of switching to atomic mode. In such cases, @confirm_switch
* will be invoked after the switching is complete.
*
* Due to the way percpu_ref is implemented, @confirm_switch will be called
* after at least one full sched RCU grace period has passed but this is an
* implementation detail and must not be depended upon.
* This function may block if @ref is in the process of switching to atomic
* mode. If the caller ensures that @ref is not in the process of
* switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
ref->force_atomic = true;
__percpu_ref_switch_to_atomic(ref, confirm_switch);
}
__percpu_ref_switch_mode(ref, confirm_switch);
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
int cpu;
BUG_ON(!percpu_count);
if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
return;
wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
/*
* Restore per-cpu operation. smp_store_release() is paired with
* smp_read_barrier_depends() in __ref_is_percpu() and guarantees
* that the zeroing is visible to all percpu accesses which can see
* the following __PERCPU_REF_ATOMIC clearing.
*/
for_each_possible_cpu(cpu)
*per_cpu_ptr(percpu_count, cpu) = 0;
smp_store_release(&ref->percpu_count_ptr,
ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
/**
@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
* dying or dead, the actual switching takes place on the following
* percpu_ref_reinit().
*
* This function normally doesn't block and can be called from any context
* but it may block if @ref is in the process of switching to atomic mode
* by percpu_ref_switch_atomic().
* This function may block if @ref is in the process of switching to atomic
* mode. If the caller ensures that @ref is not in the process of
* switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
ref->force_atomic = false;
unsigned long flags;
/* a dying or dead ref can't be switched to percpu mode w/o reinit */
if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
__percpu_ref_switch_to_percpu(ref);
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
ref->force_atomic = false;
__percpu_ref_switch_mode(ref, NULL);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
/**
@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
*
* This function normally doesn't block and can be called from any context
* but it may block if @confirm_kill is specified and @ref is in the
* process of switching to atomic mode by percpu_ref_switch_atomic().
*
* Due to the way percpu_ref is implemented, @confirm_switch will be called
* after at least one full sched RCU grace period has passed but this is an
* implementation detail and must not be depended upon.
* process of switching to atomic mode by percpu_ref_switch_to_atomic().
*/
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill)
{
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
"%s called more than once on %pf!", __func__, ref->release);
ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
__percpu_ref_switch_to_atomic(ref, confirm_kill);
__percpu_ref_switch_mode(ref, confirm_kill);
percpu_ref_put(ref);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
*/
void percpu_ref_reinit(struct percpu_ref *ref)
{
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
WARN_ON_ONCE(!percpu_ref_is_zero(ref));
ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
percpu_ref_get(ref);
if (!ref->force_atomic)
__percpu_ref_switch_to_percpu(ref);
__percpu_ref_switch_mode(ref, NULL);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);

View File

@ -1961,8 +1961,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
size_t size_sum, areas_size, max_distance;
int group, i, rc;
size_t size_sum, areas_size;
unsigned long max_distance;
int group, i, highest_group, rc;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
@ -1978,7 +1979,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
goto out_free;
}
/* allocate, copy and determine base address */
/* allocate, copy and determine base address & max_distance */
highest_group = 0;
for (group = 0; group < ai->nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
unsigned int cpu = NR_CPUS;
@ -1999,6 +2001,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
areas[group] = ptr;
base = min(ptr, base);
if (ptr > areas[highest_group])
highest_group = group;
}
max_distance = areas[highest_group] - base;
max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
goto out_free_areas;
#endif
}
/*
@ -2023,23 +2040,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
}
/* base address is now known, determine group base offsets */
max_distance = 0;
for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
max_distance = max_t(size_t, max_distance,
ai->groups[group].base_offset);
}
max_distance += ai->unit_size;
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
goto out_free;
#endif
}
pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",