2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_SCHED_H
|
|
|
|
#define _LINUX_SCHED_H
|
|
|
|
|
2006-04-27 07:12:56 +08:00
|
|
|
/*
|
|
|
|
* cloning flags:
|
|
|
|
*/
|
|
|
|
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
|
|
|
|
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
|
|
|
|
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
|
|
|
|
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
|
|
|
|
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
|
|
|
|
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
|
|
|
|
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
|
|
|
|
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
|
|
|
|
#define CLONE_THREAD 0x00010000 /* Same thread group? */
|
|
|
|
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
|
|
|
|
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
|
|
|
|
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
|
|
|
|
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
|
|
|
|
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
|
|
|
|
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
|
|
|
|
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
|
|
|
|
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
|
|
|
|
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
|
2006-10-02 17:18:17 +08:00
|
|
|
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
|
2006-10-02 17:18:19 +08:00
|
|
|
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
|
2007-07-16 14:41:01 +08:00
|
|
|
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
|
2007-10-19 14:40:10 +08:00
|
|
|
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
|
2007-09-28 08:10:06 +08:00
|
|
|
#define CLONE_NEWNET 0x40000000 /* New network namespace */
|
2008-01-24 15:54:47 +08:00
|
|
|
#define CLONE_IO 0x80000000 /* Clone io context */
|
2006-04-27 07:12:56 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Scheduling policies
|
|
|
|
*/
|
|
|
|
#define SCHED_NORMAL 0
|
|
|
|
#define SCHED_FIFO 1
|
|
|
|
#define SCHED_RR 2
|
|
|
|
#define SCHED_BATCH 3
|
2007-07-10 00:51:57 +08:00
|
|
|
/* SCHED_ISO: reserved but not implemented yet */
|
|
|
|
#define SCHED_IDLE 5
|
2006-04-27 07:12:56 +08:00
|
|
|
|
2006-04-25 21:54:40 +08:00
|
|
|
#ifdef __KERNEL__
|
2006-04-27 07:12:56 +08:00
|
|
|
|
|
|
|
/*
 * Scheduling parameters of a task.  The structure carries a single
 * integer priority value.
 */
struct sched_param {
	int sched_priority;	/* priority value */
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/param.h> /* for HZ */
|
|
|
|
|
|
|
|
#include <linux/capability.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/timex.h>
|
|
|
|
#include <linux/jiffies.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/thread_info.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/nodemask.h>
|
2007-10-16 16:24:43 +08:00
|
|
|
#include <linux/mm_types.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/system.h>
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/ptrace.h>
|
|
|
|
#include <asm/cputime.h>
|
|
|
|
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/sem.h>
|
|
|
|
#include <linux/signal.h>
|
|
|
|
#include <linux/fs_struct.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/pid.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/topology.h>
|
2007-10-17 14:25:50 +08:00
|
|
|
#include <linux/proportions.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/seccomp.h>
|
2006-01-08 17:01:37 +08:00
|
|
|
#include <linux/rcupdate.h>
|
2006-06-27 17:54:53 +08:00
|
|
|
#include <linux/rtmutex.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-04-25 21:54:40 +08:00
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/param.h>
|
|
|
|
#include <linux/resource.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/hrtimer.h>
|
2006-12-10 18:19:19 +08:00
|
|
|
#include <linux/task_io_accounting.h>
|
2007-10-15 23:00:14 +08:00
|
|
|
#include <linux/kobject.h>
|
2008-01-26 04:08:34 +08:00
|
|
|
#include <linux/latencytop.h>
|
2008-08-13 23:20:04 +08:00
|
|
|
#include <linux/cred.h>
|
2006-04-25 21:54:40 +08:00
|
|
|
|
|
|
|
#include <asm/processor.h>
|
2005-09-07 06:16:49 +08:00
|
|
|
|
2008-02-07 16:13:51 +08:00
|
|
|
struct mem_cgroup;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct exec_domain;
|
2006-06-27 17:54:58 +08:00
|
|
|
struct futex_pi_state;
|
2008-01-26 04:08:34 +08:00
|
|
|
struct robust_list_head;
|
When stacked block devices are in-use (e.g. md or dm), the recursive calls
to generic_make_request can use up a lot of space, and we would rather they
didn't.
As generic_make_request is a void function, and as it is generally not
expected that it will have any effect immediately, it is safe to delay any
call to generic_make_request until there is sufficient stack space
available.
As ->bi_next is reserved for the driver to use, it can have no valid value
when generic_make_request is called, and as __make_request implicitly
assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be
certain that all callers set it to NULL. We can therefore safely use
bi_next to link pending requests together, providing we clear it before
making the real call.
So, we choose to allow each thread to only be active in one
generic_make_request at a time. If a subsequent (recursive) call is made,
the bio is linked into a per-thread list, and is handled when the active
call completes.
As the list of pending bios is per-thread, there are no locking issues to
worry about.
I say above that it is "safe to delay any call...". There are, however,
some behaviours of a make_request_fn which would make it unsafe. These
include any behaviour that assumes anything will have changed after a
recursive call to generic_make_request.
These could include:
- waiting for that call to finish and call its bi_end_io function.
md used to sometimes do this (marking the superblock dirty before
completing a write) but doesn't any more.
- inspecting the bio for fields that generic_make_request might
change, such as bi_sector or bi_bdev. It is hard to see a good
reason for this, and I don't think anyone actually does it.
- inspecting the queue to see if, e.g., it is 'full' yet. Again, I
think this is very unlikely to be useful, or to be done.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: <dm-devel@redhat.com>
Alasdair G Kergon <agk@redhat.com> said:
I can see nothing wrong with this in principle.
For device-mapper at the moment though it's essential that, while the bio
mappings may now get delayed, they still get processed in exactly
the same order as they were passed to generic_make_request().
My main concern is whether the timing changes implicit in this patch
will make the rare data-corrupting races in the existing snapshot code
more likely. (I'm working on a fix for these races, but the unfinished
patch is already several hundred lines long.)
It would be helpful if some people on this mailing list would test
this patch in various scenarios and report back.
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2007-05-01 15:53:42 +08:00
|
|
|
struct bio;
|
2008-11-25 16:01:25 +08:00
|
|
|
struct bts_tracer;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* List of flags we want to share for kernel threads,
|
|
|
|
* if only because they are not used by them anyway.
|
|
|
|
*/
|
|
|
|
#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These are the constants used to fake the fixed-point load-average
|
|
|
|
* counting. Some notes:
|
|
|
|
* - 11 bit fractions expand to 22 bits by the multiplies: this gives
|
|
|
|
* a load-average precision of 10 bits integer + 11 bits fractional
|
|
|
|
* - if you want to count load-averages more often, you need more
|
|
|
|
* precision, or rounding will get you. With 2-second counting freq,
|
|
|
|
* the EXP_n values would be 1981, 2034 and 2043 if still using only
|
|
|
|
* 11 bit fractions.
|
|
|
|
*/
|
|
|
|
extern unsigned long avenrun[]; /* Load averages */
|
|
|
|
|
|
|
|
#define FSHIFT 11 /* nr of bits of precision */
|
|
|
|
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
|
2007-10-08 07:17:38 +08:00
|
|
|
#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
|
|
|
|
#define EXP_5 2014 /* 1/exp(5sec/5min) */
|
|
|
|
#define EXP_15 2037 /* 1/exp(5sec/15min) */
|
|
|
|
|
|
|
|
/*
 * CALC_LOAD - fold one new sample into a fixed-point running average.
 * @load: lvalue holding the running average; updated in place
 * @exp:  decay factor (one of the EXP_n constants)
 * @n:    new sample to blend in
 *
 * Computes load = (load * exp + n * (FIXED_1 - exp)) >> FSHIFT.
 *
 * Every argument is parenthesized so that an expression argument such
 * as "a + b" is weighted as a whole rather than being split up by
 * operator precedence.  @load and @exp are expanded more than once, so
 * they must be free of side effects.
 *
 * The expansion is three statements ending in ';' (kept that way for
 * existing callers), so it must not be used as the unbraced body of an
 * if/else or loop.
 */
#define CALC_LOAD(load,exp,n) \
	(load) *= (exp); \
	(load) += (n)*(FIXED_1-(exp)); \
	(load) >>= FSHIFT;
|
|
|
|
|
|
|
|
extern unsigned long total_forks;
|
|
|
|
extern int nr_threads;
|
|
|
|
DECLARE_PER_CPU(unsigned long, process_counts);
|
|
|
|
extern int nr_processes(void);
|
|
|
|
extern unsigned long nr_running(void);
|
|
|
|
extern unsigned long nr_uninterruptible(void);
|
2006-03-31 18:31:21 +08:00
|
|
|
extern unsigned long nr_active(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern unsigned long nr_iowait(void);
|
|
|
|
|
2007-07-10 00:52:00 +08:00
|
|
|
struct seq_file;
|
|
|
|
struct cfs_rq;
|
2007-10-15 23:00:14 +08:00
|
|
|
struct task_group;
|
2007-07-10 00:52:00 +08:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
|
|
|
|
extern void proc_sched_set_task(struct task_struct *p);
|
|
|
|
extern void
|
2007-08-09 17:16:47 +08:00
|
|
|
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
|
2007-07-10 00:52:00 +08:00
|
|
|
#else
|
|
|
|
/*
 * CONFIG_SCHED_DEBUG is not set: the scheduler debug hooks below are
 * empty inline stand-ins, so callers need no #ifdefs of their own.
 */
static inline void
proc_sched_show_task(struct task_struct *p, struct seq_file *m) { }

static inline void proc_sched_set_task(struct task_struct *p) { }

static inline void
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { }
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-23 15:31:35 +08:00
|
|
|
extern unsigned long long time_sync_thresh;
|
|
|
|
|
2005-09-30 06:18:21 +08:00
|
|
|
/*
|
|
|
|
* Task state bitmask. NOTE! These bits are also
|
|
|
|
* encoded in fs/proc/array.c: get_task_state().
|
|
|
|
*
|
|
|
|
* We have two separate sets of flags: task->state
|
|
|
|
* is about runnability, while task->exit_state are
|
|
|
|
* about the task exiting. Confusing, but this way
|
|
|
|
* modifying one set can't modify the other one by
|
|
|
|
* mistake.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
#define TASK_RUNNING 0
|
|
|
|
#define TASK_INTERRUPTIBLE 1
|
|
|
|
#define TASK_UNINTERRUPTIBLE 2
|
2007-12-07 00:13:16 +08:00
|
|
|
#define __TASK_STOPPED 4
|
|
|
|
#define __TASK_TRACED 8
|
2005-09-30 06:18:21 +08:00
|
|
|
/* in tsk->exit_state */
|
|
|
|
#define EXIT_ZOMBIE 16
|
|
|
|
#define EXIT_DEAD 32
|
|
|
|
/* in tsk->state again */
|
2007-10-15 23:00:13 +08:00
|
|
|
#define TASK_DEAD 64
|
2007-12-07 00:13:16 +08:00
|
|
|
#define TASK_WAKEKILL 128
|
|
|
|
|
|
|
|
/* Convenience macros for the sake of set_task_state */
|
|
|
|
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
|
|
|
|
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
|
|
|
|
#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-12-06 23:55:25 +08:00
|
|
|
/* Convenience macros for the sake of wake_up */
|
|
|
|
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
|
2007-12-07 00:13:16 +08:00
|
|
|
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
|
2007-12-06 23:55:25 +08:00
|
|
|
|
|
|
|
/* get_task_state() */
|
|
|
|
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
|
2007-12-07 00:13:16 +08:00
|
|
|
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
|
|
|
|
__TASK_TRACED)
|
2007-12-06 23:55:25 +08:00
|
|
|
|
2007-12-07 00:13:16 +08:00
|
|
|
/*
 * Non-zero when __TASK_TRACED is set in task->state.  The argument is
 * parenthesized so that expressions such as "&tsk" or "p + 1" expand
 * correctly.
 */
#define task_is_traced(task)	(((task)->state & __TASK_TRACED) != 0)
|
|
|
|
/*
 * Non-zero when __TASK_STOPPED is set in task->state.  The argument is
 * parenthesized so that expressions such as "&tsk" or "p + 1" expand
 * correctly.
 */
#define task_is_stopped(task)	(((task)->state & __TASK_STOPPED) != 0)
|
2007-12-06 23:55:25 +08:00
|
|
|
/*
 * Non-zero when either __TASK_STOPPED or __TASK_TRACED is set in
 * task->state.  The argument is parenthesized so that expressions such
 * as "&tsk" or "p + 1" expand correctly.
 */
#define task_is_stopped_or_traced(task)	\
	(((task)->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
|
2007-12-06 23:55:25 +08:00
|
|
|
/*
 * Non-zero when TASK_UNINTERRUPTIBLE is set in task->state.  The
 * argument is parenthesized so that expressions such as "&tsk" or
 * "p + 1" expand correctly.
 */
#define task_contributes_to_load(task)	\
	(((task)->state & TASK_UNINTERRUPTIBLE) != 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define __set_task_state(tsk, state_value) \
|
|
|
|
do { (tsk)->state = (state_value); } while (0)
|
|
|
|
#define set_task_state(tsk, state_value) \
|
|
|
|
set_mb((tsk)->state, (state_value))
|
|
|
|
|
2005-09-13 16:25:14 +08:00
|
|
|
/*
|
|
|
|
* set_current_state() includes a barrier so that the write of current->state
|
|
|
|
* is correctly serialised wrt the caller's subsequent test of whether to
|
|
|
|
* actually sleep:
|
|
|
|
*
|
|
|
|
* set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
* if (do_i_need_to_sleep())
|
|
|
|
* schedule();
|
|
|
|
*
|
|
|
|
* If the caller does not need such serialisation then use __set_current_state()
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
#define __set_current_state(state_value) \
|
|
|
|
do { current->state = (state_value); } while (0)
|
|
|
|
#define set_current_state(state_value) \
|
|
|
|
set_mb(current->state, (state_value))
|
|
|
|
|
|
|
|
/* Task command name length */
|
|
|
|
#define TASK_COMM_LEN 16
|
|
|
|
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This serializes "schedule()" and also protects
|
|
|
|
* the run-queue from deletions/modifications (but
|
|
|
|
* _adding_ to the beginning of the run-queue has
|
|
|
|
* a separate lock).
|
|
|
|
*/
|
|
|
|
extern rwlock_t tasklist_lock;
|
|
|
|
extern spinlock_t mmlist_lock;
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern void sched_init(void);
|
|
|
|
extern void sched_init_smp(void);
|
2008-02-16 01:56:34 +08:00
|
|
|
extern asmlinkage void schedule_tail(struct task_struct *prev);
|
2006-07-03 15:25:41 +08:00
|
|
|
extern void init_idle(struct task_struct *idle, int cpu);
|
2007-07-10 00:51:58 +08:00
|
|
|
extern void init_idle_bootup_task(struct task_struct *idle);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-05-13 03:20:52 +08:00
|
|
|
extern int runqueue_is_locked(void);
|
2008-11-10 22:39:30 +08:00
|
|
|
extern void task_rq_unlock_wait(struct task_struct *p);
|
2008-05-13 03:20:52 +08:00
|
|
|
|
2008-11-25 00:05:04 +08:00
|
|
|
extern cpumask_var_t nohz_cpu_mask;
|
2007-05-08 15:32:51 +08:00
|
|
|
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
|
|
|
|
extern int select_nohz_load_balancer(int cpu);
|
|
|
|
#else
|
|
|
|
/*
 * Fallback used unless both CONFIG_SMP and CONFIG_NO_HZ are enabled:
 * ignores @cpu and always returns 0.
 */
static inline int select_nohz_load_balancer(int cpu)
{
	(void)cpu;	/* not used in this configuration */
	return 0;
}
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-12-07 12:35:59 +08:00
|
|
|
/*
|
2007-04-26 11:50:03 +08:00
|
|
|
* Only dump TASK_* tasks. (0 for all tasks)
|
2006-12-07 12:35:59 +08:00
|
|
|
*/
|
|
|
|
extern void show_state_filter(unsigned long state_filter);
|
|
|
|
|
|
|
|
/*
 * show_state - convenience wrapper around show_state_filter().
 *
 * Passes a filter of 0, which show_state_filter() documents as
 * "all tasks".
 */
static inline void show_state(void)
{
	const unsigned long all_tasks = 0;	/* 0 == no state filtering */

	show_state_filter(all_tasks);
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void show_regs(struct pt_regs *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TASK is a pointer to the task whose backtrace we want to see (or NULL for current
|
|
|
|
* task), SP is the stack pointer of the first frame that should be shown in the back
|
|
|
|
* trace (or NULL if the entire call-chain of the task should be shown).
|
|
|
|
*/
|
|
|
|
extern void show_stack(struct task_struct *task, unsigned long *sp);
|
|
|
|
|
|
|
|
void io_schedule(void);
|
|
|
|
long io_schedule_timeout(long timeout);
|
|
|
|
|
|
|
|
extern void cpu_init (void);
|
|
|
|
extern void trap_init(void);
|
|
|
|
extern void update_process_times(int user);
|
|
|
|
extern void scheduler_tick(void);
|
|
|
|
|
2008-01-26 04:08:02 +08:00
|
|
|
extern void sched_show_task(struct task_struct *p);
|
|
|
|
|
2005-09-07 06:16:27 +08:00
|
|
|
#ifdef CONFIG_DETECT_SOFTLOCKUP
|
2006-03-24 19:18:41 +08:00
|
|
|
extern void softlockup_tick(void);
|
2005-09-07 06:16:27 +08:00
|
|
|
extern void touch_softlockup_watchdog(void);
|
2007-05-08 15:28:05 +08:00
|
|
|
extern void touch_all_softlockup_watchdogs(void);
|
2008-05-13 03:21:04 +08:00
|
|
|
extern unsigned int softlockup_panic;
|
2008-01-26 04:08:02 +08:00
|
|
|
extern unsigned long sysctl_hung_task_check_count;
|
|
|
|
extern unsigned long sysctl_hung_task_timeout_secs;
|
2008-01-26 04:08:34 +08:00
|
|
|
extern unsigned long sysctl_hung_task_warnings;
|
2008-05-13 03:21:14 +08:00
|
|
|
extern int softlockup_thresh;
|
2005-09-07 06:16:27 +08:00
|
|
|
#else
|
2006-03-24 19:18:41 +08:00
|
|
|
/*
 * CONFIG_DETECT_SOFTLOCKUP is not set: the soft-lockup hooks below are
 * empty inline stand-ins, so callers need no #ifdefs of their own.
 *
 * NOTE(review): no declaration of spawn_softlockup_task() is visible in
 * the CONFIG_DETECT_SOFTLOCKUP branch of this header -- confirm whether
 * that stub still has users.
 */
static inline void softlockup_tick(void) { }
static inline void spawn_softlockup_task(void) { }
static inline void touch_softlockup_watchdog(void) { }
static inline void touch_all_softlockup_watchdogs(void) { }
|
2005-09-07 06:16:27 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Attach to any functions which should be ignored in wchan output. */
|
|
|
|
#define __sched __attribute__((__section__(".sched.text")))
|
2007-11-28 22:52:56 +08:00
|
|
|
|
|
|
|
/* Linker adds these: start and end of __sched functions */
|
|
|
|
extern char __sched_text_start[], __sched_text_end[];
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Is this address in the __sched functions? */
|
|
|
|
extern int in_sched_functions(unsigned long addr);
|
|
|
|
|
|
|
|
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
|
2008-02-14 07:03:15 +08:00
|
|
|
extern signed long schedule_timeout(signed long timeout);
|
2005-09-10 15:27:21 +08:00
|
|
|
extern signed long schedule_timeout_interruptible(signed long timeout);
|
2007-12-07 00:59:46 +08:00
|
|
|
extern signed long schedule_timeout_killable(signed long timeout);
|
2005-09-10 15:27:21 +08:00
|
|
|
extern signed long schedule_timeout_uninterruptible(signed long timeout);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage void schedule(void);
|
|
|
|
|
2006-10-02 17:18:06 +08:00
|
|
|
struct nsproxy;
|
2007-07-16 14:40:59 +08:00
|
|
|
struct user_namespace;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Maximum number of active map areas. This is a random (large) number */
|
|
|
|
#define DEFAULT_MAX_MAP_COUNT 65536
|
|
|
|
|
|
|
|
extern int sysctl_max_map_count;
|
|
|
|
|
|
|
|
#include <linux/aio.h>
|
|
|
|
|
|
|
|
extern unsigned long
|
|
|
|
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
|
|
|
|
unsigned long, unsigned long);
|
|
|
|
extern unsigned long
|
|
|
|
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
2005-06-22 08:14:49 +08:00
|
|
|
extern void arch_unmap_area(struct mm_struct *, unsigned long);
|
|
|
|
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-09-10 06:43:22 +08:00
|
|
|
#if USE_SPLIT_PTLOCKS
|
2005-10-30 09:16:41 +08:00
|
|
|
/*
|
|
|
|
* The mm counters are not protected by its page_table_lock,
|
|
|
|
* so must be incremented atomically.
|
|
|
|
*/
|
2006-01-06 16:11:20 +08:00
|
|
|
#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
|
|
|
|
#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
|
|
|
|
#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
|
|
|
|
#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
|
|
|
|
#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
|
2005-10-30 09:16:41 +08:00
|
|
|
|
2008-09-10 06:43:22 +08:00
|
|
|
#else /* !USE_SPLIT_PTLOCKS */
|
2005-10-30 09:16:41 +08:00
|
|
|
/*
|
|
|
|
* The mm counters are protected by its page_table_lock,
|
|
|
|
* so can be incremented directly.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Plain (non-atomic) accessors for the mm->_<member> counters.
 *
 * Each expansion is wrapped in parentheses so that it behaves as one
 * expression wherever it is used; without them, for example,
 * "set_mm_counter(mm, m, 0) == 0" would store the result of the
 * comparison into the counter.
 */
#define set_mm_counter(mm, member, value) ((mm)->_##member = (value))
#define get_mm_counter(mm, member) ((mm)->_##member)
#define add_mm_counter(mm, member, value) ((mm)->_##member += (value))
#define inc_mm_counter(mm, member) ((mm)->_##member++)
#define dec_mm_counter(mm, member) ((mm)->_##member--)
|
2005-10-30 09:16:41 +08:00
|
|
|
|
2008-09-10 06:43:22 +08:00
|
|
|
#endif /* !USE_SPLIT_PTLOCKS */
|
2005-10-30 09:16:05 +08:00
|
|
|
|
2005-10-30 09:16:41 +08:00
|
|
|
#define get_mm_rss(mm) \
|
|
|
|
(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
|
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:18 +08:00
|
|
|
#define update_hiwater_rss(mm) do { \
|
|
|
|
unsigned long _rss = get_mm_rss(mm); \
|
|
|
|
if ((mm)->hiwater_rss < _rss) \
|
|
|
|
(mm)->hiwater_rss = _rss; \
|
|
|
|
} while (0)
|
|
|
|
#define update_hiwater_vm(mm) do { \
|
|
|
|
if ((mm)->hiwater_vm < (mm)->total_vm) \
|
|
|
|
(mm)->hiwater_vm = (mm)->total_vm; \
|
|
|
|
} while (0)
|
|
|
|
|
2009-01-07 06:40:29 +08:00
|
|
|
#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm))
|
|
|
|
#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
|
|
|
|
|
2007-07-19 16:48:27 +08:00
|
|
|
extern void set_dumpable(struct mm_struct *mm, int value);
|
|
|
|
extern int get_dumpable(struct mm_struct *mm);
|
|
|
|
|
|
|
|
/* mm flags */
|
2007-07-19 16:48:28 +08:00
|
|
|
/* dumpable bits */
|
2007-07-19 16:48:27 +08:00
|
|
|
#define MMF_DUMPABLE 0 /* core dump is permitted */
|
|
|
|
#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
|
2007-07-19 16:48:28 +08:00
|
|
|
#define MMF_DUMPABLE_BITS 2
|
|
|
|
|
|
|
|
/* coredump filter bits */
|
|
|
|
#define MMF_DUMP_ANON_PRIVATE 2
|
|
|
|
#define MMF_DUMP_ANON_SHARED 3
|
|
|
|
#define MMF_DUMP_MAPPED_PRIVATE 4
|
|
|
|
#define MMF_DUMP_MAPPED_SHARED 5
|
2007-10-17 14:27:02 +08:00
|
|
|
#define MMF_DUMP_ELF_HEADERS 6
|
coredump_filter: add hugepage dumping
Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped. But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g. vmalloc, sound related).
Thus hugepages are never dumped and it can't be debugged easily. Many
developers want hugepages to be included into core-dump.
However, we can't read a generic VM_RESERVED area because such an area is
often an IO mapping area, and reading it may change device state. That is
definitely an undesirable side effect.
So adding a hugepage-specific bit to the coredump filter is better. It
enables hugepage core dumping and doesn't cause any side effects on
any I/O devices.
In addition, libhugetlb uses hugetlb private mapping pages as anonymous
pages. So hugepage private mapping pages should be core dumped by
default.
So /proc/[pid]/coredump_filter has two new bits:
- bit 5 means hugetlb private mapping pages are dumped or not. (default: yes)
- bit 6 means hugetlb shared mapping pages are dumped or not. (default: no)
I tested by following method.
% ulimit -c unlimited
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
#include "hugetlbfs.h"
int main(int argc, char** argv){
char* p;
int ch;
int mmap_flags = MAP_SHARED;
int fd;
int nr_pages;
while((ch = getopt(argc, argv, "p")) != -1) {
switch (ch) {
case 'p':
mmap_flags &= ~MAP_SHARED;
mmap_flags |= MAP_PRIVATE;
break;
default:
/* nothing*/
break;
}
}
argc -= optind;
argv += optind;
if (argc == 0){
printf("need # of pages\n");
exit(1);
}
nr_pages = atoi(argv[0]);
if (nr_pages < 2) {
printf("nr_pages must >2\n");
exit(1);
}
fd = hugetlbfs_unlinked_fd();
p = mmap(NULL, nr_pages * gethugepagesize(),
PROT_READ|PROT_WRITE, mmap_flags, fd, 0);
sleep(2);
*(p + gethugepagesize()) = 1; /* COW */
sleep(2);
/* crash! */
*(int*)0 = 1;
return 0;
}
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
|
|
|
#define MMF_DUMP_HUGETLB_PRIVATE 7
|
|
|
|
#define MMF_DUMP_HUGETLB_SHARED 8
|
2007-07-19 16:48:28 +08:00
|
|
|
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
|
coredump_filter: add hugepage dumping
Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped. But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g. vmalloc, sound related).
Thus hugepages are never dumped and it can't be debugged easily. Many
developers want hugepages to be included into core-dump.
However, we can't read a generic VM_RESERVED area because such an area is
often an IO mapping area, and reading it may change device state. That is
definitely an undesirable side effect.
So adding a hugepage-specific bit to the coredump filter is better. It
enables hugepage core dumping and doesn't cause any side effects on
any I/O devices.
In addition, libhugetlb uses hugetlb private mapping pages as anonymous
pages. So hugepage private mapping pages should be core dumped by
default.
So /proc/[pid]/coredump_filter has two new bits:
- bit 5 means hugetlb private mapping pages are dumped or not. (default: yes)
- bit 6 means hugetlb shared mapping pages are dumped or not. (default: no)
I tested by following method.
% ulimit -c unlimited
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
#include "hugetlbfs.h"
int main(int argc, char** argv){
char* p;
int ch;
int mmap_flags = MAP_SHARED;
int fd;
int nr_pages;
while((ch = getopt(argc, argv, "p")) != -1) {
switch (ch) {
case 'p':
mmap_flags &= ~MAP_SHARED;
mmap_flags |= MAP_PRIVATE;
break;
default:
/* nothing*/
break;
}
}
argc -= optind;
argv += optind;
if (argc == 0){
printf("need # of pages\n");
exit(1);
}
nr_pages = atoi(argv[0]);
if (nr_pages < 2) {
printf("nr_pages must >2\n");
exit(1);
}
fd = hugetlbfs_unlinked_fd();
p = mmap(NULL, nr_pages * gethugepagesize(),
PROT_READ|PROT_WRITE, mmap_flags, fd, 0);
sleep(2);
*(p + gethugepagesize()) = 1; /* COW */
sleep(2);
/* crash! */
*(int*)0 = 1;
return 0;
}
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
|
|
|
#define MMF_DUMP_FILTER_BITS 7
|
2007-07-19 16:48:28 +08:00
|
|
|
#define MMF_DUMP_FILTER_MASK \
|
|
|
|
(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
|
|
|
|
#define MMF_DUMP_FILTER_DEFAULT \
|
coredump_filter: add hugepage dumping
Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped. But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g. vmalloc, sound related).
Thus hugepages are never dumped and it can't be debugged easily. Many
developers want hugepages to be included into core-dump.
However, we can't read a generic VM_RESERVED area because such an area is
often an IO mapping area, and reading it may change device state. That is
definitely an undesirable side effect.
So adding a hugepage-specific bit to the coredump filter is better. It
enables hugepage core dumping and doesn't cause any side effects on
any I/O devices.
In addition, libhugetlb uses hugetlb private mapping pages as anonymous
pages. So hugepage private mapping pages should be core dumped by
default.
So /proc/[pid]/coredump_filter has two new bits:
- bit 5 means hugetlb private mapping pages are dumped or not. (default: yes)
- bit 6 means hugetlb shared mapping pages are dumped or not. (default: no)
I tested by following method.
% ulimit -c unlimited
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
#include "hugetlbfs.h"
int main(int argc, char** argv){
char* p;
int ch;
int mmap_flags = MAP_SHARED;
int fd;
int nr_pages;
while((ch = getopt(argc, argv, "p")) != -1) {
switch (ch) {
case 'p':
mmap_flags &= ~MAP_SHARED;
mmap_flags |= MAP_PRIVATE;
break;
default:
/* nothing*/
break;
}
}
argc -= optind;
argv += optind;
if (argc == 0){
printf("need # of pages\n");
exit(1);
}
nr_pages = atoi(argv[0]);
if (nr_pages < 2) {
printf("nr_pages must >2\n");
exit(1);
}
fd = hugetlbfs_unlinked_fd();
p = mmap(NULL, nr_pages * gethugepagesize(),
PROT_READ|PROT_WRITE, mmap_flags, fd, 0);
sleep(2);
*(p + gethugepagesize()) = 1; /* COW */
sleep(2);
/* crash! */
*(int*)0 = 1;
return 0;
}
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
|
|
|
((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
|
2008-10-19 11:28:23 +08:00
|
|
|
(1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
|
|
|
|
|
|
|
|
/*
 * MMF_DUMP_MASK_DEFAULT_ELF is the ELF-headers contribution to a default
 * dump mask: the MMF_DUMP_ELF_HEADERS bit when
 * CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is set, and 0 otherwise, so it can
 * be ORed into a mask expression unconditionally.
 */
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
|
|
|
|
# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
|
|
|
|
#else
|
|
|
|
# define MMF_DUMP_MASK_DEFAULT_ELF 0
|
|
|
|
#endif
|
2007-07-19 16:48:27 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Signal-handler state: a table of k_sigaction entries sized by _NSIG,
 * together with the spinlock and a wait queue head.
 *
 * The NOTE on signal_struct later in this file states that signal_struct
 * has no locking of its own and relies on the locking of this structure.
 */
struct sighand_struct {
|
|
|
|
atomic_t count; /* presumably a reference count of sharers -- confirm */
|
|
|
|
struct k_sigaction action[_NSIG]; /* _NSIG entries; presumably indexed by signal number -- confirm */
|
|
|
|
spinlock_t siglock; /* see the signal_struct NOTE: also covers signal_struct */
|
2007-09-21 03:40:16 +08:00
|
|
|
wait_queue_head_t signalfd_wqh; /* wait queue; by its name, used by signalfd -- confirm */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2006-06-25 20:49:24 +08:00
|
|
|
/*
 * Accumulator record with "ac_"-prefixed fields.
 * NOTE(review): presumably the process-accounting (acct) totals -- the
 * users of this structure are not visible here; confirm against callers.
 */
struct pacct_struct {
|
2006-06-25 20:49:25 +08:00
|
|
|
int ac_flag; /* flag word; bit meanings are defined elsewhere */
|
|
|
|
long ac_exitcode; /* by its name, an exit code -- confirm */
|
2006-06-25 20:49:24 +08:00
|
|
|
unsigned long ac_mem; /* by its name, a memory figure; units not shown here */
|
2006-06-25 20:49:26 +08:00
|
|
|
cputime_t ac_utime, ac_stime; /* cputime_t values; presumably user/system time */
|
|
|
|
unsigned long ac_minflt, ac_majflt; /* presumably minor/major fault counts -- confirm */
|
2006-06-25 20:49:24 +08:00
|
|
|
};
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/**
|
|
|
|
* struct task_cputime - collected CPU time counts
|
|
|
|
* @utime: time spent in user mode, in &cputime_t units
|
|
|
|
* @stime: time spent in kernel mode, in &cputime_t units
|
|
|
|
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
|
2008-09-14 23:11:46 +08:00
|
|
|
*
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
* This structure groups together three kinds of CPU time that are
|
|
|
|
* tracked for threads and thread groups. Most things considering
|
|
|
|
* CPU time want to group these counts together and treat all three
|
|
|
|
* of them in parallel.
|
|
|
|
*/
|
|
|
|
struct task_cputime {
|
|
|
|
cputime_t utime; /* user-mode time, in cputime_t units */
|
|
|
|
cputime_t stime; /* kernel-mode time, in cputime_t units */
|
|
|
|
unsigned long long sum_exec_runtime; /* total time on the CPU, in nanoseconds */
|
|
|
|
};
|
|
|
|
/*
 * Alternate field names for struct task_cputime when it is used to cache
 * expirations rather than accumulated times.
 *
 * These are plain object-like macros, so after this point the preprocessor
 * rewrites every occurrence of these three identifiers, not only member
 * accesses on a struct task_cputime.
 */
|
|
|
|
#define prof_exp stime /* alias for the stime member */
|
|
|
|
#define virt_exp utime /* alias for the utime member */
|
|
|
|
#define sched_exp sum_exec_runtime /* alias for the sum_exec_runtime member */
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct thread_group_cputime - thread group interval timer counts
|
|
|
|
* @totals: thread group interval timers; substructure for
|
|
|
|
* uniprocessor kernel, per-cpu for SMP kernel.
|
|
|
|
*
|
|
|
|
* This structure contains the version of task_cputime, above, that is
|
|
|
|
* used for thread group CPU clock calculations.
|
|
|
|
*/
|
|
|
|
struct thread_group_cputime {
|
|
|
|
/*
 * Pointer to the task_cputime totals.
 * NOTE(review): the kernel-doc above describes a substructure for
 * uniprocessor kernels, but only this pointer form is declared here;
 * confirm whether that description is still accurate.
 */
struct task_cputime *totals;
|
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* NOTE! "signal_struct" does not have its own
|
|
|
|
* locking, because a shared signal_struct always
|
|
|
|
* implies a shared sighand_struct, so locking
|
|
|
|
* sighand_struct is always a proper superset of
|
|
|
|
* the locking of signal_struct.
|
|
|
|
*/
|
|
|
|
struct signal_struct {
|
|
|
|
atomic_t count;
|
|
|
|
atomic_t live;
|
|
|
|
|
|
|
|
wait_queue_head_t wait_chldexit; /* for wait4() */
|
|
|
|
|
|
|
|
/* current thread group signal load-balancing target: */
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct *curr_target;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* shared signal handling: */
|
|
|
|
struct sigpending shared_pending;
|
|
|
|
|
|
|
|
/* thread group exit support */
|
|
|
|
int group_exit_code;
|
|
|
|
/* overloaded:
|
|
|
|
* - notify group_exit_task when ->count is equal to notify_count
|
|
|
|
* - everyone except group_exit_task is stopped during signal delivery
|
|
|
|
* of fatal signals, group_exit_task processes the signal.
|
|
|
|
*/
|
|
|
|
int notify_count;
|
2008-08-01 20:18:04 +08:00
|
|
|
struct task_struct *group_exit_task;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* thread group stop support, overloads group_exit_code too */
|
|
|
|
int group_stop_count;
|
|
|
|
unsigned int flags; /* see SIGNAL_* flags below */
|
|
|
|
|
|
|
|
/* POSIX.1b Interval Timers */
|
|
|
|
struct list_head posix_timers;
|
|
|
|
|
|
|
|
/* ITIMER_REAL timer for the process */
|
2006-01-10 12:52:34 +08:00
|
|
|
struct hrtimer real_timer;
|
2008-02-08 20:19:19 +08:00
|
|
|
struct pid *leader_pid;
|
2006-01-10 12:52:34 +08:00
|
|
|
ktime_t it_real_incr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
|
|
|
|
cputime_t it_prof_expires, it_virt_expires;
|
|
|
|
cputime_t it_prof_incr, it_virt_incr;
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/*
|
|
|
|
* Thread group totals for process CPU clocks.
|
|
|
|
* See thread_group_cputime(), et al, for details.
|
|
|
|
*/
|
|
|
|
struct thread_group_cputime cputime;
|
|
|
|
|
|
|
|
/* Earliest-expiration cache. */
|
|
|
|
struct task_cputime cputime_expires;
|
|
|
|
|
|
|
|
struct list_head cpu_timers[3];
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* job control IDs */
|
2007-10-19 14:40:39 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* pgrp and session fields are deprecated.
|
|
|
|
* use the task_session_Xnr and task_pgrp_Xnr routines below
|
|
|
|
*/
|
|
|
|
|
|
|
|
union {
|
|
|
|
pid_t pgrp __deprecated;
|
|
|
|
pid_t __pgrp;
|
|
|
|
};
|
|
|
|
|
2007-02-12 16:53:00 +08:00
|
|
|
struct pid *tty_old_pgrp;
|
2006-12-08 18:37:55 +08:00
|
|
|
|
|
|
|
union {
|
|
|
|
pid_t session __deprecated;
|
|
|
|
pid_t __session;
|
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* boolean value for session group leader */
|
|
|
|
int leader;
|
|
|
|
|
|
|
|
struct tty_struct *tty; /* NULL if no tty */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cumulative resource counters for dead threads in the group,
|
|
|
|
* and for reaped dead child processes forked by this group.
|
|
|
|
* Live threads maintain their own counters and add to these
|
|
|
|
* in __exit_signal, except for the group leader.
|
|
|
|
*/
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is NULL it just uses the appropriate fields of task_struct) and,
if non-NULL, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
cputime_t cutime, cstime;
|
2007-10-15 23:00:19 +08:00
|
|
|
cputime_t gtime;
|
|
|
|
cputime_t cgtime;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
|
|
|
|
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
|
2007-05-11 13:22:37 +08:00
|
|
|
unsigned long inblock, oublock, cinblock, coublock;
|
2008-07-28 06:48:12 +08:00
|
|
|
struct task_io_accounting ioac;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't bother to synchronize most readers of this at all,
|
|
|
|
* because there is no reader checking a limit that actually needs
|
|
|
|
* to get both rlim_cur and rlim_max atomically, and either one
|
|
|
|
* alone is a single word that can safely be read normally.
|
|
|
|
* getrlimit/setrlimit use task_lock(current->group_leader) to
|
|
|
|
* protect this instead of the siglock, because they really
|
|
|
|
* have no need to disable irqs.
|
|
|
|
*/
|
|
|
|
struct rlimit rlim[RLIM_NLIMITS];
|
|
|
|
|
2006-06-25 20:49:24 +08:00
|
|
|
#ifdef CONFIG_BSD_PROCESS_ACCT
|
|
|
|
struct pacct_struct pacct; /* per-process accounting information */
|
|
|
|
#endif
|
2006-07-14 15:24:44 +08:00
|
|
|
#ifdef CONFIG_TASKSTATS
|
|
|
|
struct taskstats *stats;
|
|
|
|
#endif
|
Audit: add TTY input auditing
Add TTY input auditing, used to audit system administrator's actions. This is
required by various security standards such as DCID 6/3 and PCI to provide
non-repudiation of administrator's actions and to allow a review of past
actions if the administrator seems to overstep their duties or if the system
becomes misconfigured for unknown reasons. These requirements do not make it
necessary to audit TTY output as well.
Compared to a user-space keylogger, this approach records TTY input using the
audit subsystem, correlated with other audit events, and it is completely
transparent to the user-space application (e.g. the console ioctls still
work).
TTY input auditing works on a higher level than auditing all system calls
within the session, which would produce an overwhelming amount of mostly
useless audit events.
Add an "audit_tty" attribute, inherited across fork (). Data read from TTYs
by process with the attribute is sent to the audit subsystem by the kernel.
The audit netlink interface is extended to allow modifying the audit_tty
attribute, and to allow sending explanatory audit events from user-space (for
example, a shell might send an event containing the final command, after the
interactive command-line editing and history expansion is performed, which
might be difficult to decipher from the TTY input alone).
Because the "audit_tty" attribute is inherited across fork (), it would be set
e.g. for sshd restarted within an audited session. To prevent this, the
audit_tty attribute is cleared when a process with no open TTY file
descriptors (e.g. after daemon startup) opens a TTY.
See https://www.redhat.com/archives/linux-audit/2007-June/msg00000.html for a
more detailed rationale document for an older version of this patch.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Miloslav Trmac <mitr@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Paul Fulghum <paulkf@microgate.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-16 14:40:56 +08:00
|
|
|
#ifdef CONFIG_AUDIT
|
|
|
|
unsigned audit_tty;
|
|
|
|
struct tty_audit_buf *tty_audit_buf;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2005-06-26 05:57:23 +08:00
|
|
|
/*
 * Context switch must be unlocked if interrupts are to be enabled:
 * an architecture that defines __ARCH_WANT_INTERRUPTS_ON_CTXSW
 * gets __ARCH_WANT_UNLOCKED_CTXSW defined as well.
 */
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
# define __ARCH_WANT_UNLOCKED_CTXSW
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_DEQUEUED	0x00000002 /* stop signal dequeued */
#define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED	0x00000010
#define SIGNAL_CLD_CONTINUED	0x00000020
/* Mask covering both pending-notification bits above. */
#define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */
|
|
|
|
|
2008-02-05 14:27:24 +08:00
|
|
|
/* If true, all threads except ->group_exit_task have pending SIGKILL */
|
|
|
|
static inline int signal_group_exit(const struct signal_struct *sig)
|
|
|
|
{
|
|
|
|
return (sig->flags & SIGNAL_GROUP_EXIT) ||
|
|
|
|
(sig->group_exit_task != NULL);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Some day this will be a full-fledged user tracking system..
|
|
|
|
*/
|
|
|
|
struct user_struct {
|
|
|
|
atomic_t __count; /* reference count */
|
|
|
|
atomic_t processes; /* How many processes does this user have? */
|
|
|
|
atomic_t files; /* How many open files does this user have? */
|
|
|
|
atomic_t sigpending; /* How many pending signals does this user have? */
|
2006-06-02 04:10:59 +08:00
|
|
|
#ifdef CONFIG_INOTIFY_USER
|
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-13 05:06:03 +08:00
|
|
|
atomic_t inotify_watches; /* How many inotify watches does this user have? */
|
|
|
|
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
|
|
|
|
#endif
|
epoll: introduce resource usage limits
It has been thought that the per-user file descriptors limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that lets a normal user request a pretty large
amount of kernel memory, well within its maximum number of fds. To
solve such problem, default limits are now imposed, and /proc based
configuration has been introduced. A new directory has been created,
named /proc/sys/fs/epoll/ and inside there, there are two configuration
points:
max_user_instances = Maximum number of devices - per user
max_user_watches = Maximum number of "watched" fds - per user
The current default for "max_user_watches" limits the memory used by epoll
to store "watches", to 1/32 of the amount of the low RAM. As example, a
256MB 32bit machine, will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users. The
default value for "max_user_instances" is set to 128, that should be
enough too.
This also changes the userspace, because a new error code can now come out
from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already
listed, so that should be ok.
[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-12-02 05:13:55 +08:00
|
|
|
#ifdef CONFIG_EPOLL
|
|
|
|
atomic_t epoll_devs; /* The number of epoll descriptors currently open */
|
|
|
|
atomic_t epoll_watches; /* The number of file descriptors currently watched */
|
|
|
|
#endif
|
2007-10-17 14:30:09 +08:00
|
|
|
#ifdef CONFIG_POSIX_MQUEUE
|
2005-04-17 06:20:36 +08:00
|
|
|
/* protected by mq_lock */
|
|
|
|
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
|
2007-10-17 14:30:09 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long locked_shm; /* How many pages of mlocked shm ? */
|
|
|
|
|
|
|
|
#ifdef CONFIG_KEYS
|
|
|
|
struct key *uid_keyring; /* UID specific keyring */
|
|
|
|
struct key *session_keyring; /* UID's default session keyring */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Hash table maintenance information */
|
2007-09-19 13:46:44 +08:00
|
|
|
struct hlist_node uidhash_node;
|
2005-04-17 06:20:36 +08:00
|
|
|
uid_t uid;
|
2008-10-16 05:38:45 +08:00
|
|
|
struct user_namespace *user_ns;
|
2007-10-15 23:00:09 +08:00
|
|
|
|
2008-02-13 22:45:40 +08:00
|
|
|
#ifdef CONFIG_USER_SCHED
|
2007-10-15 23:00:14 +08:00
|
|
|
struct task_group *tg;
|
2007-10-17 22:55:11 +08:00
|
|
|
#ifdef CONFIG_SYSFS
|
2007-11-02 20:47:53 +08:00
|
|
|
struct kobject kobj;
|
2007-10-15 23:00:14 +08:00
|
|
|
struct work_struct work;
|
2007-10-15 23:00:09 +08:00
|
|
|
#endif
|
2007-10-17 22:55:11 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2007-11-02 20:47:53 +08:00
|
|
|
extern int uids_sysfs_init(void);
|
2007-10-15 23:00:14 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern struct user_struct *find_user(uid_t);
|
|
|
|
|
|
|
|
extern struct user_struct root_user;
|
|
|
|
#define INIT_USER (&root_user)
|
|
|
|
|
2008-11-14 07:39:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct backing_dev_info;
|
|
|
|
struct reclaim_state;
|
|
|
|
|
2006-07-14 15:24:38 +08:00
|
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Scheduler bookkeeping: cumulative counters and timestamps.
 * bkl_count is only present when CONFIG_SCHEDSTATS is set.
 */
struct sched_info {
	/* cumulative counters */
	unsigned long pcount;	      /* # of times run on this cpu */
	unsigned long long run_delay; /* time spent waiting on a runqueue */

	/* timestamps */
	unsigned long long last_arrival,/* when we last ran on a cpu */
			   last_queued;	/* when we were last queued to run */
#ifdef CONFIG_SCHEDSTATS
	/* BKL stats */
	unsigned int bkl_count;
#endif
};
|
2006-07-14 15:24:38 +08:00
|
|
|
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-07-14 15:24:36 +08:00
|
|
|
#ifdef CONFIG_TASK_DELAY_ACCT
|
|
|
|
struct task_delay_info {
|
|
|
|
spinlock_t lock;
|
|
|
|
unsigned int flags; /* Private per-task flags */
|
|
|
|
|
|
|
|
/* For each stat XXX, add following, aligned appropriately
|
|
|
|
*
|
|
|
|
* struct timespec XXX_start, XXX_end;
|
|
|
|
* u64 XXX_delay;
|
|
|
|
* u32 XXX_count;
|
|
|
|
*
|
|
|
|
* Atomicity of updates to XXX_delay, XXX_count protected by
|
|
|
|
* single lock above (split into XXX_lock if contention is an issue).
|
|
|
|
*/
|
2006-07-14 15:24:37 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX_count is incremented on every XXX operation, the delay
|
|
|
|
* associated with the operation is added to XXX_delay.
|
|
|
|
* XXX_delay contains the accumulated delay time in nanoseconds.
|
|
|
|
*/
|
|
|
|
struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
|
|
|
|
u64 blkio_delay; /* wait for sync block io completion */
|
|
|
|
u64 swapin_delay; /* wait for swapin block io completion */
|
|
|
|
u32 blkio_count; /* total count of the number of sync block */
|
|
|
|
/* io operations performed */
|
|
|
|
u32 swapin_count; /* total count of the number of swapin block */
|
|
|
|
/* io operations performed */
|
2008-07-25 16:48:52 +08:00
|
|
|
|
|
|
|
struct timespec freepages_start, freepages_end;
|
|
|
|
u64 freepages_delay; /* wait for memory reclaim */
|
|
|
|
u32 freepages_count; /* total count of memory reclaim */
|
2006-07-14 15:24:36 +08:00
|
|
|
};
|
2006-07-14 15:24:38 +08:00
|
|
|
#endif /* CONFIG_TASK_DELAY_ACCT */
|
|
|
|
|
|
|
|
/*
 * Report whether sched_info collection is active.
 *
 * Returns 1 when CONFIG_SCHEDSTATS is defined; otherwise, when
 * CONFIG_TASK_DELAY_ACCT is defined, returns the current value of
 * delayacct_on; returns 0 when neither option is defined.
 */
static inline int sched_info_on(void)
{
#ifdef CONFIG_SCHEDSTATS
	return 1;
#elif defined(CONFIG_TASK_DELAY_ACCT)
	/* Declared locally so only this configuration references the symbol. */
	extern int delayacct_on;
	return delayacct_on;
#else
	return 0;
#endif
}
|
2006-07-14 15:24:36 +08:00
|
|
|
|
2007-07-10 00:51:57 +08:00
|
|
|
/*
 * CPU idle classification. CPU_MAX_IDLE_TYPES is the last enumerator,
 * so its value equals the number of idle types that precede it.
 */
enum cpu_idle_type {
	CPU_IDLE,
	CPU_NOT_IDLE,
	CPU_NEWLY_IDLE,
	CPU_MAX_IDLE_TYPES
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sched-domains (multiprocessor balancing) declarations:
|
|
|
|
*/
|
2007-07-10 00:51:58 +08:00
|
|
|
|
|
|
|
/*
 * Increase resolution of nice-level calculations:
 */
#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)

/* Fuzz value, currently the same as the scale itself. */
#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. But with current
smpnice the extra normal priority task keeps jumping from one cpu to another
cpu having a normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic. We are not including the
cpu with the high priority task in max_load calculations but including that in
total and avg_load calculations, leading to max_load < avg_load, and load
balance between cpus running normal priority tasks (2 vs 1) will always show
an imbalance of one normal priority task, and the extra normal priority task will
keep moving from one cpu to another cpu having a normal priority task.
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group.
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority task, Package-1 containing one
low priority task (with the other thread being idle). Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with the highest priority task. And ultimately (with the
help of active load balance) we move the high priority task to Package-1. And
the same continues with Package-0 now, moving the high priority task from Package-1
to Package-0. Even without the presence of active load balance, load
balance will fail to balance the above scenario. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * sched_domain flag bits (see the "flags" member of struct sched_domain).
 * Every value is a distinct power of two, so flags can be OR-ed together
 * and tested with a bitwise AND.
 */
#define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
#define SD_BALANCE_EXEC		4	/* Balance on exec */
#define SD_BALANCE_FORK		8	/* Balance on fork, clone */
#define SD_WAKE_IDLE		16	/* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
#define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES	512	/* Domain members share cpu pkg resources */
#define SD_SERIALIZE		1024	/* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
|
2006-06-27 17:54:42 +08:00
|
|
|
|
2008-12-19 01:56:09 +08:00
|
|
|
/*
 * Levels of power-saving load balancing, in increasing order of
 * aggressiveness.  MAX_POWERSAVINGS_BALANCE_LEVELS is not a level itself;
 * it is the number of levels defined before it.
 */
enum powersavings_balance_level {
	POWERSAVINGS_BALANCE_NONE = 0,	/* No power saving load balance */
	POWERSAVINGS_BALANCE_BASIC,	/* Fill one thread/core/package
					 * first for long running threads
					 */
	POWERSAVINGS_BALANCE_WAKEUP,	/* Also bias task wakeups to semi-idle
					 * cpu package for power savings
					 */
	MAX_POWERSAVINGS_BALANCE_LEVELS
};
|
2006-10-03 16:14:09 +08:00
|
|
|
|
2008-12-19 01:56:02 +08:00
|
|
|
/*
 * Power-savings tunables, defined outside this header.  The
 * sd_balance_for_*_power() and sd_power_saving_flags() helpers in this
 * header only test them for being non-zero.
 * NOTE(review): presumably each holds an enum powersavings_balance_level
 * value - confirm against the definition.
 */
extern int sched_mc_power_savings, sched_smt_power_savings;
|
2006-10-03 16:14:09 +08:00
|
|
|
|
2008-12-19 01:56:02 +08:00
|
|
|
static inline int sd_balance_for_mc_power(void)
|
|
|
|
{
|
|
|
|
if (sched_smt_power_savings)
|
|
|
|
return SD_POWERSAVINGS_BALANCE;
|
2006-06-27 17:54:42 +08:00
|
|
|
|
2008-12-19 01:56:02 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2006-10-03 16:14:09 +08:00
|
|
|
|
2008-12-19 01:56:02 +08:00
|
|
|
static inline int sd_balance_for_package_power(void)
|
|
|
|
{
|
|
|
|
if (sched_mc_power_savings | sched_smt_power_savings)
|
|
|
|
return SD_POWERSAVINGS_BALANCE;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2006-06-27 17:54:42 +08:00
|
|
|
|
2008-12-19 01:56:47 +08:00
|
|
|
/*
|
|
|
|
* Optimise SD flags for power savings:
|
|
|
|
* SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
|
|
|
|
* Keep default SD flags if sched_{smt,mc}_power_saving=0
|
|
|
|
*/
|
|
|
|
|
|
|
|
static inline int sd_power_saving_flags(void)
|
|
|
|
{
|
|
|
|
if (sched_mc_power_savings | sched_smt_power_savings)
|
|
|
|
return SD_BALANCE_NEWIDLE;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct sched_group {
|
|
|
|
struct sched_group *next; /* Must be a circular list */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
|
|
|
|
* single CPU. This is read only (except for setup, hotplug CPU).
|
2007-05-08 15:32:57 +08:00
|
|
|
* Note : Never change cpu_power without recompute its reciprocal
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-05-08 15:32:57 +08:00
|
|
|
unsigned int __cpu_power;
|
|
|
|
/*
|
|
|
|
* reciprocal value of cpu_power to avoid expensive divides
|
|
|
|
* (see include/linux/reciprocal_div.h)
|
|
|
|
*/
|
|
|
|
u32 reciprocal_cpu_power;
|
2008-11-25 00:05:04 +08:00
|
|
|
|
|
|
|
unsigned long cpumask[];
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2008-11-25 00:05:04 +08:00
|
|
|
static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
|
|
|
|
{
|
2008-11-25 00:05:04 +08:00
|
|
|
return to_cpumask(sg->cpumask);
|
2008-11-25 00:05:04 +08:00
|
|
|
}
|
|
|
|
|
2008-04-15 13:04:23 +08:00
|
|
|
/*
 * Identifies the level of a sched_domain (stored in sched_domain.level).
 * SD_LV_MAX is not a level; it is the number of levels defined before it.
 */
enum sched_domain_level {
	SD_LV_NONE = 0,
	SD_LV_SIBLING,
	SD_LV_MC,
	SD_LV_CPU,
	SD_LV_NODE,
	SD_LV_ALLNODES,
	SD_LV_MAX
};
|
|
|
|
|
|
|
|
/*
 * Attributes passed along with a set of scheduler domains (see
 * partition_sched_domains()).  SD_ATTR_INIT sets relax_domain_level to -1.
 */
struct sched_domain_attr {
	int relax_domain_level;
};
|
|
|
|
|
|
|
|
/*
 * Default value for a struct sched_domain_attr: relax_domain_level = -1.
 * Deliberately left as a bare compound literal (no outer parentheses) so
 * that existing uses as an initializer keep working.
 */
#define SD_ATTR_INIT	(struct sched_domain_attr) {	\
	.relax_domain_level = -1,			\
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_domain {
|
|
|
|
/* These fields must be setup */
|
|
|
|
struct sched_domain *parent; /* top domain must be null terminated */
|
2006-10-03 16:14:08 +08:00
|
|
|
struct sched_domain *child; /* bottom domain must be null terminated */
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_group *groups; /* the balancing groups of the domain */
|
|
|
|
unsigned long min_interval; /* Minimum balance interval ms */
|
|
|
|
unsigned long max_interval; /* Maximum balance interval ms */
|
|
|
|
unsigned int busy_factor; /* less balancing by factor if busy */
|
|
|
|
unsigned int imbalance_pct; /* No balance until over watermark */
|
|
|
|
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
|
2005-06-26 05:57:13 +08:00
|
|
|
unsigned int busy_idx;
|
|
|
|
unsigned int idle_idx;
|
|
|
|
unsigned int newidle_idx;
|
|
|
|
unsigned int wake_idx;
|
2005-06-26 05:57:19 +08:00
|
|
|
unsigned int forkexec_idx;
|
2005-04-17 06:20:36 +08:00
|
|
|
int flags; /* See SD_* */
|
2008-04-15 13:04:23 +08:00
|
|
|
enum sched_domain_level level;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Runtime fields. */
|
|
|
|
unsigned long last_balance; /* init to jiffies. units in jiffies */
|
|
|
|
unsigned int balance_interval; /* initialise to 1. units in ms. */
|
|
|
|
unsigned int nr_balance_failed; /* initialise to 0 */
|
|
|
|
|
2008-06-27 19:41:35 +08:00
|
|
|
u64 last_update;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
/* load_balance() stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Active load balancing */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int alb_count;
|
|
|
|
unsigned int alb_failed;
|
|
|
|
unsigned int alb_pushed;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-26 05:57:20 +08:00
|
|
|
/* SD_BALANCE_EXEC stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int sbe_count;
|
|
|
|
unsigned int sbe_balanced;
|
|
|
|
unsigned int sbe_pushed;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-26 05:57:20 +08:00
|
|
|
/* SD_BALANCE_FORK stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int sbf_count;
|
|
|
|
unsigned int sbf_balanced;
|
|
|
|
unsigned int sbf_pushed;
|
2005-06-26 05:57:20 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* try_to_wake_up() stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int ttwu_wake_remote;
|
|
|
|
unsigned int ttwu_move_affine;
|
|
|
|
unsigned int ttwu_move_balance;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
2008-10-09 17:35:51 +08:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
char *name;
|
|
|
|
#endif
|
2008-11-25 00:05:04 +08:00
|
|
|
|
|
|
|
/* span of all CPUs in this domain */
|
|
|
|
unsigned long span[];
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2008-11-25 00:05:04 +08:00
|
|
|
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
|
|
|
|
{
|
2008-11-25 00:05:04 +08:00
|
|
|
return to_cpumask(sd->span);
|
2008-11-25 00:05:04 +08:00
|
|
|
}
|
|
|
|
|
2008-11-25 00:05:14 +08:00
|
|
|
/*
 * SMP prototype; the definition is not part of this header.
 * NOTE(review): presumably installs @ndoms_new domains described by
 * @doms_new with per-domain attributes @dattr_new - confirm against the
 * definition.
 */
extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
				    struct sched_domain_attr *dattr_new);
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2008-12-19 04:30:23 +08:00
|
|
|
/* Test a flag in parent sched domain */
|
|
|
|
static inline int test_sd_parent(struct sched_domain *sd, int flag)
|
|
|
|
{
|
|
|
|
if (sd->parent && (sd->parent->flags & flag))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-07-18 20:01:39 +08:00
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
|
|
|
|
/* Only pointers to it are needed here, so a forward declaration suffices. */
struct sched_domain_attr;

/*
 * !CONFIG_SMP variant: same signature as the SMP prototype, but a no-op,
 * so callers need no #ifdef of their own.
 */
static inline void
partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
			struct sched_domain_attr *dattr_new)
{
}
|
|
|
|
#endif /* !CONFIG_SMP */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct io_context; /* See blkdev.h */
|
|
|
|
|
|
|
|
|
2005-09-10 04:02:02 +08:00
|
|
|
/*
 * prefetch_stack(): provided out of line when the architecture defines
 * ARCH_HAS_PREFETCH_SWITCH_STACK; otherwise an empty inline stub.
 */
#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
extern void prefetch_stack(struct task_struct *t);
#else
static inline void prefetch_stack(struct task_struct *t) { }
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Forward declarations: only pointers to these types are used below. */
struct audit_context;		/* See audit.c */
struct mempolicy;
struct pipe_inode_info;
struct uts_namespace;

struct rq;
struct sched_domain;
|
|
|
|
|
|
|
|
/*
 * Table of operations implemented by a scheduling class.  Each member is
 * a callback; "next" points to another (const) sched_class.
 * NOTE(review): presumably the classes are chained through "next" in
 * priority order - confirm against the scheduler core.
 */
struct sched_class {
	const struct sched_class *next;

	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
	void (*yield_task) (struct rq *rq);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);

	struct task_struct * (*pick_next_task) (struct rq *rq);
	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int sync);

	/*
	 * Per the "sched: simplify move_tasks()" changelog: load_balance()
	 * returns the amount of weighted load it moved (there is no longer
	 * a load_moved out-parameter), and move_one_task() was added for
	 * active_load_balance(), which only ever moves a single task.
	 */
	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
			struct rq *busiest, unsigned long max_load_move,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *all_pinned, int *this_best_prio);

	int (*move_one_task) (struct rq *this_rq, int this_cpu,
			      struct rq *busiest, struct sched_domain *sd,
			      enum cpu_idle_type idle);
	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
	int (*needs_post_schedule) (struct rq *this_rq);
	void (*post_schedule) (struct rq *this_rq);
	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task) (struct rq *rq);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	void (*task_new) (struct rq *rq, struct task_struct *p);

	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
			       int running);
	void (*switched_to) (struct rq *this_rq, struct task_struct *task,
			     int running);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			     int oldprio, int running);

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*moved_group) (struct task_struct *p);
#endif
};
|
|
|
|
|
|
|
|
/*
 * A load weight together with a second value, inv_weight.
 * NOTE(review): inv_weight is presumably a precomputed inverse of weight
 * used to avoid divisions - confirm against the users in the scheduler.
 */
struct load_weight {
	unsigned long weight, inv_weight;
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CFS stats for a schedulable entity (task, task-group etc)
|
|
|
|
*
|
|
|
|
* Current field usage histogram:
|
|
|
|
*
|
|
|
|
* 4 se->block_start
|
|
|
|
* 4 se->run_node
|
|
|
|
* 4 se->sleep_start
|
|
|
|
* 6 se->load.weight
|
|
|
|
*/
|
|
|
|
struct sched_entity {
|
|
|
|
struct load_weight load; /* for load-balancing */
|
|
|
|
struct rb_node run_node;
|
2008-04-20 01:45:00 +08:00
|
|
|
struct list_head group_node;
|
2007-07-10 00:51:58 +08:00
|
|
|
unsigned int on_rq;
|
|
|
|
|
2007-08-02 23:41:40 +08:00
|
|
|
u64 exec_start;
|
|
|
|
u64 sum_exec_runtime;
|
2007-10-15 23:00:04 +08:00
|
|
|
u64 vruntime;
|
2007-08-28 18:53:24 +08:00
|
|
|
u64 prev_sum_exec_runtime;
|
2007-08-02 23:41:40 +08:00
|
|
|
|
2008-03-19 08:42:00 +08:00
|
|
|
u64 last_wakeup;
|
|
|
|
u64 avg_overlap;
|
|
|
|
|
2007-08-02 23:41:40 +08:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
2007-07-10 00:51:58 +08:00
|
|
|
u64 wait_start;
|
2007-08-02 23:41:40 +08:00
|
|
|
u64 wait_max;
|
2008-01-26 04:08:35 +08:00
|
|
|
u64 wait_count;
|
|
|
|
u64 wait_sum;
|
2007-08-02 23:41:40 +08:00
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
u64 sleep_start;
|
|
|
|
u64 sleep_max;
|
2007-08-02 23:41:40 +08:00
|
|
|
s64 sum_sleep_runtime;
|
|
|
|
|
|
|
|
u64 block_start;
|
2007-07-10 00:51:58 +08:00
|
|
|
u64 block_max;
|
|
|
|
u64 exec_max;
|
2007-10-15 23:00:02 +08:00
|
|
|
u64 slice_max;
|
2007-10-15 23:00:18 +08:00
|
|
|
|
|
|
|
u64 nr_migrations;
|
|
|
|
u64 nr_migrations_cold;
|
|
|
|
u64 nr_failed_migrations_affine;
|
|
|
|
u64 nr_failed_migrations_running;
|
|
|
|
u64 nr_failed_migrations_hot;
|
|
|
|
u64 nr_forced_migrations;
|
|
|
|
u64 nr_forced2_migrations;
|
|
|
|
|
|
|
|
u64 nr_wakeups;
|
|
|
|
u64 nr_wakeups_sync;
|
|
|
|
u64 nr_wakeups_migrate;
|
|
|
|
u64 nr_wakeups_local;
|
|
|
|
u64 nr_wakeups_remote;
|
|
|
|
u64 nr_wakeups_affine;
|
|
|
|
u64 nr_wakeups_affine_attempts;
|
|
|
|
u64 nr_wakeups_passive;
|
|
|
|
u64 nr_wakeups_idle;
|
2007-08-02 23:41:40 +08:00
|
|
|
#endif
|
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
struct sched_entity *parent;
|
|
|
|
/* rq on which this entity is (to be) queued: */
|
|
|
|
struct cfs_rq *cfs_rq;
|
|
|
|
/* rq "owned" by this entity/group: */
|
|
|
|
struct cfs_rq *my_q;
|
|
|
|
#endif
|
|
|
|
};
|
2006-07-03 15:25:42 +08:00
|
|
|
|
2008-01-26 04:08:27 +08:00
|
|
|
struct sched_rt_entity {
|
|
|
|
struct list_head run_list;
|
2008-01-26 04:08:27 +08:00
|
|
|
unsigned long timeout;
|
2008-08-01 20:24:08 +08:00
|
|
|
unsigned int time_slice;
|
2008-01-26 04:08:30 +08:00
|
|
|
int nr_cpus_allowed;
|
|
|
|
|
2008-04-20 01:45:00 +08:00
|
|
|
struct sched_rt_entity *back;
|
2008-02-13 22:45:40 +08:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2008-01-26 04:08:30 +08:00
|
|
|
struct sched_rt_entity *parent;
|
|
|
|
/* rq on which this entity is (to be) queued: */
|
|
|
|
struct rt_rq *rt_rq;
|
|
|
|
/* rq "owned" by this entity/group: */
|
|
|
|
struct rt_rq *my_q;
|
|
|
|
#endif
|
2008-01-26 04:08:27 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct task_struct {
|
|
|
|
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
|
2007-05-09 17:35:17 +08:00
|
|
|
void *stack;
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_t usage;
|
2007-05-08 15:23:41 +08:00
|
|
|
unsigned int flags; /* per process flags, defined below */
|
|
|
|
unsigned int ptrace;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-05-06 07:16:12 +08:00
|
|
|
int lock_depth; /* BKL lock depth */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice==19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. But with current
smpnice the extra normal priority task keeps jumping from one cpu to another
cpu having a normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic. We are not including the
cpu with the high priority task in max_load calculations but including that in
total and avg_load calculations, leading to max_load < avg_load, and load
balance between cpus running normal priority tasks (2 vs 1) will always show
an imbalance of one normal priority task, and the extra normal priority task will
keep moving from one cpu to another cpu having a normal priority task.
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group.
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority task, Package-1 containing one
low priority task (with the other thread being idle). Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with the highest priority task. And ultimately (with the
help of active load balance) we move the high priority task to Package-1. And
the same continues with Package-0 now, moving the high priority task from Package-1
to Package-0. Even without the presence of active load balance, load
balance will fail to balance the above scenario. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
|
2005-06-26 05:57:23 +08:00
|
|
|
int oncpu;
|
|
|
|
#endif
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice==19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
|
|
|
#endif
|
2007-07-10 00:52:00 +08:00
|
|
|
|
2006-06-27 17:54:51 +08:00
|
|
|
int prio, static_prio, normal_prio;
|
2008-05-15 19:09:15 +08:00
|
|
|
unsigned int rt_priority;
|
2007-10-15 23:00:12 +08:00
|
|
|
const struct sched_class *sched_class;
|
2007-07-10 00:51:58 +08:00
|
|
|
struct sched_entity se;
|
2008-01-26 04:08:27 +08:00
|
|
|
struct sched_rt_entity rt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-26 19:40:43 +08:00
|
|
|
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
|
|
/* list of struct preempt_notifier: */
|
|
|
|
struct hlist_head preempt_notifiers;
|
|
|
|
#endif
|
|
|
|
|
2007-10-17 14:30:26 +08:00
|
|
|
/*
|
|
|
|
* fpu_counter contains the number of consecutive context switches
|
|
|
|
* that the FPU is used. If this is over a threshold, the lazy fpu
|
|
|
|
* saving becomes unlazy to save the trap. This is an unsigned char
|
|
|
|
* so that after 256 times the counter wraps and the behavior turns
|
|
|
|
* lazy again; this to deal with bursty apps that only use FPU for
|
|
|
|
* a short time
|
|
|
|
*/
|
|
|
|
unsigned char fpu_counter;
|
|
|
|
s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
|
2006-09-29 16:59:40 +08:00
|
|
|
#ifdef CONFIG_BLK_DEV_IO_TRACE
|
2006-03-24 03:00:26 +08:00
|
|
|
unsigned int btrace_seq;
|
2006-09-29 16:59:40 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-05-08 15:23:41 +08:00
|
|
|
unsigned int policy;
|
2005-04-17 06:20:36 +08:00
|
|
|
cpumask_t cpus_allowed;
|
|
|
|
|
2008-01-26 04:08:24 +08:00
|
|
|
#ifdef CONFIG_PREEMPT_RCU
|
|
|
|
int rcu_read_lock_nesting;
|
|
|
|
int rcu_flipctr_idx;
|
|
|
|
#endif /* #ifdef CONFIG_PREEMPT_RCU */
|
|
|
|
|
2006-07-14 15:24:38 +08:00
|
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_info sched_info;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct list_head tasks;
|
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis). When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted. This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.
When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state. Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over. The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.
However, the current implementation suffers from a limitation in the
push logic. Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc). What's more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue. This causes two problems:
1) We can have the same tasks analyzed over and over again during each
push, which extends out the fast path in the scheduler for no
gain. Consider a RQ that has dozens of tasks that are bound to a
core. Each one of those tasks will be encountered and skipped
for each push operation while they are queued.
2) There may be lower-priority tasks under the unpushable task that
could have been successfully pushed, but will never be considered
until either the unpushable task is cleared, or a pull operation
succeeds. The net result is a potential latency source for mid
priority tasks.
This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks". A task is added to the list
each time a task is activated or preempted. It is removed from the
list any time it is deactivated, made current, or fails to push.
This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper. This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable task. Now every task will have a push attempted (when
appropriate).
This reduces latency both by shortening the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.
[ rostedt: added a couple more BUG_ONs ]
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
2008-12-29 22:39:53 +08:00
|
|
|
struct plist_node pushable_tasks;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct mm_struct *mm, *active_mm;
|
|
|
|
|
|
|
|
/* task state */
|
|
|
|
struct linux_binfmt *binfmt;
|
2007-05-08 15:23:41 +08:00
|
|
|
int exit_state;
|
2005-04-17 06:20:36 +08:00
|
|
|
int exit_code, exit_signal;
|
|
|
|
int pdeath_signal; /* The signal sent when the parent dies */
|
|
|
|
/* ??? */
|
2007-05-08 15:23:41 +08:00
|
|
|
unsigned int personality;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned did_exec:1;
|
|
|
|
pid_t pid;
|
|
|
|
pid_t tgid;
|
2006-09-26 16:52:38 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_CC_STACKPROTECTOR
|
|
|
|
/* Canary value for the -fstack-protector gcc feature */
|
|
|
|
unsigned long stack_canary;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* pointers to (original) parent process, youngest child, younger sibling,
|
|
|
|
* older sibling, respectively. (p->father can be replaced with
|
2008-03-25 09:36:23 +08:00
|
|
|
* p->real_parent->pid)
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2008-03-25 09:36:23 +08:00
|
|
|
struct task_struct *real_parent; /* real parent process */
|
|
|
|
struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2008-03-25 09:36:23 +08:00
|
|
|
* children/sibling forms the list of my natural children
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
struct list_head children; /* list of my children */
|
|
|
|
struct list_head sibling; /* linkage in my parent's children list */
|
|
|
|
struct task_struct *group_leader; /* threadgroup leader */
|
|
|
|
|
2008-03-25 09:36:23 +08:00
|
|
|
/*
|
|
|
|
* ptraced is the list of tasks this task is using ptrace on.
|
|
|
|
* This includes both natural children and PTRACE_ATTACH targets.
|
|
|
|
* p->ptrace_entry is p's link on the p->parent->ptraced list.
|
|
|
|
*/
|
|
|
|
struct list_head ptraced;
|
|
|
|
struct list_head ptrace_entry;
|
|
|
|
|
2008-11-25 16:01:25 +08:00
|
|
|
#ifdef CONFIG_X86_PTRACE_BTS
|
|
|
|
/*
|
|
|
|
* This is the tracer handle for the ptrace BTS extension.
|
|
|
|
* This field actually belongs to the ptracer task.
|
|
|
|
*/
|
|
|
|
struct bts_tracer *bts;
|
2008-11-25 16:05:27 +08:00
|
|
|
/*
|
|
|
|
* The buffer to hold the BTS data.
|
|
|
|
*/
|
|
|
|
void *bts_buffer;
|
2008-12-11 20:49:59 +08:00
|
|
|
size_t bts_size;
|
2008-11-25 16:01:25 +08:00
|
|
|
#endif /* CONFIG_X86_PTRACE_BTS */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* PID/PID hash table linkage. */
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists whenever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an independent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:42 +08:00
|
|
|
struct pid_link pids[PIDTYPE_MAX];
|
2006-03-29 08:11:25 +08:00
|
|
|
struct list_head thread_group;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct completion *vfork_done; /* for vfork() */
|
|
|
|
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
|
|
|
|
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
|
|
|
|
|
2007-10-18 18:06:34 +08:00
|
|
|
cputime_t utime, stime, utimescaled, stimescaled;
|
2007-10-15 23:00:19 +08:00
|
|
|
cputime_t gtime;
|
2007-10-30 07:26:32 +08:00
|
|
|
cputime_t prev_utime, prev_stime;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long nvcsw, nivcsw; /* context switch counts */
|
2007-07-16 14:39:42 +08:00
|
|
|
struct timespec start_time; /* monotonic time */
|
|
|
|
struct timespec real_start_time; /* boot based time */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
|
|
|
|
unsigned long min_flt, maj_flt;
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
struct task_cputime cputime_expires;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct list_head cpu_timers[3];
|
|
|
|
|
|
|
|
/* process credentials */
|
2008-11-14 07:39:26 +08:00
|
|
|
const struct cred *real_cred; /* objective and real subjective task
|
|
|
|
* credentials (COW) */
|
|
|
|
const struct cred *cred; /* effective (overridable) subjective task
|
|
|
|
* credentials (COW) */
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may be incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather than pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */
|
2008-11-14 07:39:16 +08:00
|
|
|
|
2005-05-06 07:16:12 +08:00
|
|
|
char comm[TASK_COMM_LEN]; /* executable name excluding path
|
|
|
|
- access with [gs]et_task_comm (which lock
|
|
|
|
it with task_lock())
|
|
|
|
- initialized normally by flush_old_exec */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* file system info */
|
|
|
|
int link_count, total_link_count;
|
2006-09-29 16:59:40 +08:00
|
|
|
#ifdef CONFIG_SYSVIPC
|
2005-04-17 06:20:36 +08:00
|
|
|
/* ipc stuff */
|
|
|
|
struct sysv_sem sysvsem;
|
2006-09-29 16:59:40 +08:00
|
|
|
#endif
|
2008-01-26 04:08:02 +08:00
|
|
|
#ifdef CONFIG_DETECT_SOFTLOCKUP
|
|
|
|
/* hung task detection */
|
|
|
|
unsigned long last_switch_timestamp;
|
|
|
|
unsigned long last_switch_count;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
/* CPU-specific state of this task */
|
|
|
|
struct thread_struct thread;
|
|
|
|
/* filesystem information */
|
|
|
|
struct fs_struct *fs;
|
|
|
|
/* open file information */
|
|
|
|
struct files_struct *files;
|
2006-10-02 17:18:08 +08:00
|
|
|
/* namespaces */
|
2006-10-02 17:18:06 +08:00
|
|
|
struct nsproxy *nsproxy;
|
2005-04-17 06:20:36 +08:00
|
|
|
/* signal handlers */
|
|
|
|
struct signal_struct *signal;
|
|
|
|
struct sighand_struct *sighand;
|
|
|
|
|
|
|
|
sigset_t blocked, real_blocked;
|
2008-04-30 15:53:09 +08:00
|
|
|
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sigpending pending;
|
|
|
|
|
|
|
|
unsigned long sas_ss_sp;
|
|
|
|
size_t sas_ss_size;
|
|
|
|
int (*notifier)(void *priv);
|
|
|
|
void *notifier_data;
|
|
|
|
sigset_t *notifier_mask;
|
|
|
|
struct audit_context *audit_context;
|
2008-01-10 17:53:18 +08:00
|
|
|
#ifdef CONFIG_AUDITSYSCALL
|
|
|
|
uid_t loginuid;
|
2008-01-08 23:06:53 +08:00
|
|
|
unsigned int sessionid;
|
2008-01-10 17:53:18 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
seccomp_t seccomp;
|
|
|
|
|
|
|
|
/* Thread group tracking */
|
|
|
|
u32 parent_exec_id;
|
|
|
|
u32 self_exec_id;
|
|
|
|
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
|
|
|
|
spinlock_t alloc_lock;
|
|
|
|
|
2006-06-27 17:54:51 +08:00
|
|
|
/* Protection of the PI data structures: */
|
|
|
|
spinlock_t pi_lock;
|
|
|
|
|
2006-06-27 17:54:53 +08:00
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
|
|
/* PI waiters blocked on a rt_mutex held by this task */
|
|
|
|
struct plist_head pi_waiters;
|
|
|
|
/* Deadlock detection and priority inheritance handling */
|
|
|
|
struct rt_mutex_waiter *pi_blocked_on;
|
|
|
|
#endif
|
|
|
|
|
2006-01-10 07:59:20 +08:00
|
|
|
#ifdef CONFIG_DEBUG_MUTEXES
|
|
|
|
/* mutex deadlock detection */
|
|
|
|
struct mutex_waiter *blocked_on;
|
|
|
|
#endif
|
2006-07-03 15:24:42 +08:00
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
|
|
unsigned int irq_events;
|
|
|
|
int hardirqs_enabled;
|
|
|
|
unsigned long hardirq_enable_ip;
|
|
|
|
unsigned int hardirq_enable_event;
|
|
|
|
unsigned long hardirq_disable_ip;
|
|
|
|
unsigned int hardirq_disable_event;
|
|
|
|
int softirqs_enabled;
|
|
|
|
unsigned long softirq_disable_ip;
|
|
|
|
unsigned int softirq_disable_event;
|
|
|
|
unsigned long softirq_enable_ip;
|
|
|
|
unsigned int softirq_enable_event;
|
|
|
|
int hardirq_context;
|
|
|
|
int softirq_context;
|
|
|
|
#endif
|
[PATCH] lockdep: core
Do 'make oldconfig' and accept all the defaults for new config options -
reboot into the kernel and if everything goes well it should boot up fine and
you should have /proc/lockdep and /proc/lockdep_stats files.
Typically if the lock validator finds some problem it will print out
voluminous debug output that begins with "BUG: ..." and which syslog output
can be used by kernel developers to figure out the precise locking scenario.
What does the lock validator do? It "observes" and maps all locking rules as
they occur dynamically (as triggered by the kernel's natural use of spinlocks,
rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a
new locking scenario, it validates this new rule against the existing set of
rules. If this new rule is consistent with the existing set of rules then the
new rule is added transparently and the kernel continues as normal. If the
new rule could create a deadlock scenario then this condition is printed out.
When determining validity of locking, all possible "deadlock scenarios" are
considered: assuming arbitrary number of CPUs, arbitrary irq context and task
context constellations, running arbitrary combinations of all the existing
locking scenarios. In a typical system this means millions of separate
scenarios. This is why we call it a "locking correctness" validator - for all
rules that are observed the lock validator proves it with mathematical
certainty that a deadlock could not occur (assuming that the lock validator
implementation itself is correct and its internal data structures are not
corrupted by some other kernel subsystem). [see more details and conditionals
of this statement in include/linux/lockdep.h and
Documentation/lockdep-design.txt]
Furthermore, this "all possible scenarios" property of the validator also
enables the finding of complex, highly unlikely multi-CPU multi-context races
via single single-context rules, increasing the likelihood of finding bugs
drastically. In practical terms: the lock validator already found a bug in
the upstream kernel that could only occur on systems with 3 or more CPUs, and
which needed 3 very unlikely code sequences to occur at once on the 3 CPUs.
That bug was found and reported on a single-CPU system (!). So in essence a
race will be found "piecemeal-wise", triggering all the necessary components
for the race, without having to reproduce the race scenario itself! In its
short existence the lock validator found and reported many bugs before they
actually caused a real deadlock.
To further increase the efficiency of the validator, the mapping is not per
"lock instance", but per "lock-class". For example, all struct inode objects
in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached,
then there are 10,000 lock objects. But ->inotify_mutex is a single "lock
type", and all locking activities that occur against ->inotify_mutex are
"unified" into this single lock-class. The advantage of the lock-class
approach is that all historical ->inotify_mutex uses are mapped into a single
(and as narrow as possible) set of locking rules - regardless of how many
different tasks or inode structures it took to build this set of rules. The
set of rules persist during the lifetime of the kernel.
To see the rough magnitude of checking that the lock validator does, here's a
portion of /proc/lockdep_stats, fresh after bootup:
lock-classes: 694 [max: 2048]
direct dependencies: 1598 [max: 8192]
indirect dependencies: 17896
all direct dependencies: 16206
dependency chains: 1910 [max: 8192]
in-hardirq chains: 17
in-softirq chains: 105
in-process chains: 1065
stack-trace entries: 38761 [max: 131072]
combined max dependencies: 2033928
hardirq-safe locks: 24
hardirq-unsafe locks: 176
softirq-safe locks: 53
softirq-unsafe locks: 137
irq-safe locks: 59
irq-unsafe locks: 176
The lock validator has observed 1598 actual single-thread locking patterns,
and has validated all possible 2033928 distinct locking scenarios.
More details about the design of the lock validator can be found in
Documentation/lockdep-design.txt, which can also be found at:
http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt
[bunk@stusta.de: cleanups]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
2008-02-26 06:02:48 +08:00
|
|
|
# define MAX_LOCK_DEPTH 48UL
|
[PATCH] lockdep: core
Do 'make oldconfig' and accept all the defaults for new config options -
reboot into the kernel and if everything goes well it should boot up fine and
you should have /proc/lockdep and /proc/lockdep_stats files.
Typically if the lock validator finds some problem it will print out
voluminous debug output that begins with "BUG: ..." and which syslog output
can be used by kernel developers to figure out the precise locking scenario.
What does the lock validator do? It "observes" and maps all locking rules as
they occur dynamically (as triggered by the kernel's natural use of spinlocks,
rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a
new locking scenario, it validates this new rule against the existing set of
rules. If this new rule is consistent with the existing set of rules then the
new rule is added transparently and the kernel continues as normal. If the
new rule could create a deadlock scenario then this condition is printed out.
When determining validity of locking, all possible "deadlock scenarios" are
considered: assuming arbitrary number of CPUs, arbitrary irq context and task
context constellations, running arbitrary combinations of all the existing
locking scenarios. In a typical system this means millions of separate
scenarios. This is why we call it a "locking correctness" validator - for all
rules that are observed the lock validator proves it with mathematical
certainty that a deadlock could not occur (assuming that the lock validator
implementation itself is correct and its internal data structures are not
corrupted by some other kernel subsystem). [see more details and conditionals
of this statement in include/linux/lockdep.h and
Documentation/lockdep-design.txt]
Furthermore, this "all possible scenarios" property of the validator also
enables the finding of complex, highly unlikely multi-CPU multi-context races
via single single-context rules, increasing the likelihood of finding bugs
drastically. In practical terms: the lock validator already found a bug in
the upstream kernel that could only occur on systems with 3 or more CPUs, and
which needed 3 very unlikely code sequences to occur at once on the 3 CPUs.
That bug was found and reported on a single-CPU system (!). So in essence a
race will be found "piecemeal-wise", triggering all the necessary components
for the race, without having to reproduce the race scenario itself! In its
short existence the lock validator found and reported many bugs before they
actually caused a real deadlock.
To further increase the efficiency of the validator, the mapping is not per
"lock instance", but per "lock-class". For example, all struct inode objects
in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached,
then there are 10,000 lock objects. But ->inotify_mutex is a single "lock
type", and all locking activities that occur against ->inotify_mutex are
"unified" into this single lock-class. The advantage of the lock-class
approach is that all historical ->inotify_mutex uses are mapped into a single
(and as narrow as possible) set of locking rules - regardless of how many
different tasks or inode structures it took to build this set of rules. The
set of rules persist during the lifetime of the kernel.
To see the rough magnitude of checking that the lock validator does, here's a
portion of /proc/lockdep_stats, fresh after bootup:
lock-classes: 694 [max: 2048]
direct dependencies: 1598 [max: 8192]
indirect dependencies: 17896
all direct dependencies: 16206
dependency chains: 1910 [max: 8192]
in-hardirq chains: 17
in-softirq chains: 105
in-process chains: 1065
stack-trace entries: 38761 [max: 131072]
combined max dependencies: 2033928
hardirq-safe locks: 24
hardirq-unsafe locks: 176
softirq-safe locks: 53
softirq-unsafe locks: 137
irq-safe locks: 59
irq-unsafe locks: 176
The lock validator has observed 1598 actual single-thread locking patterns,
and has validated all possible 2033928 distinct locking scenarios.
More details about the design of the lock validator can be found in
Documentation/lockdep-design.txt, which can also be found at:
http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt
[bunk@stusta.de: cleanups]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
|
|
|
u64 curr_chain_key;
|
|
|
|
int lockdep_depth;
|
|
|
|
unsigned int lockdep_recursion;
|
2008-05-15 19:09:15 +08:00
|
|
|
struct held_lock held_locks[MAX_LOCK_DEPTH];
|
[PATCH] lockdep: core
Do 'make oldconfig' and accept all the defaults for new config options -
reboot into the kernel and if everything goes well it should boot up fine and
you should have /proc/lockdep and /proc/lockdep_stats files.
Typically if the lock validator finds some problem it will print out
voluminous debug output that begins with "BUG: ..." and which syslog output
can be used by kernel developers to figure out the precise locking scenario.
What does the lock validator do? It "observes" and maps all locking rules as
they occur dynamically (as triggered by the kernel's natural use of spinlocks,
rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a
new locking scenario, it validates this new rule against the existing set of
rules. If this new rule is consistent with the existing set of rules then the
new rule is added transparently and the kernel continues as normal. If the
new rule could create a deadlock scenario then this condition is printed out.
When determining validity of locking, all possible "deadlock scenarios" are
considered: assuming arbitrary number of CPUs, arbitrary irq context and task
context constellations, running arbitrary combinations of all the existing
locking scenarios. In a typical system this means millions of separate
scenarios. This is why we call it a "locking correctness" validator - for all
rules that are observed the lock validator proves it with mathematical
certainty that a deadlock could not occur (assuming that the lock validator
implementation itself is correct and its internal data structures are not
corrupted by some other kernel subsystem). [see more details and conditionals
of this statement in include/linux/lockdep.h and
Documentation/lockdep-design.txt]
Furthermore, this "all possible scenarios" property of the validator also
enables the finding of complex, highly unlikely multi-CPU multi-context races
via single single-context rules, increasing the likelihood of finding bugs
drastically. In practical terms: the lock validator already found a bug in
the upstream kernel that could only occur on systems with 3 or more CPUs, and
which needed 3 very unlikely code sequences to occur at once on the 3 CPUs.
That bug was found and reported on a single-CPU system (!). So in essence a
race will be found "piecemeal-wise", triggering all the necessary components
for the race, without having to reproduce the race scenario itself! In its
short existence the lock validator found and reported many bugs before they
actually caused a real deadlock.
To further increase the efficiency of the validator, the mapping is not per
"lock instance", but per "lock-class". For example, all struct inode objects
in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached,
then there are 10,000 lock objects. But ->inotify_mutex is a single "lock
type", and all locking activities that occur against ->inotify_mutex are
"unified" into this single lock-class. The advantage of the lock-class
approach is that all historical ->inotify_mutex uses are mapped into a single
(and as narrow as possible) set of locking rules - regardless of how many
different tasks or inode structures it took to build this set of rules. The
set of rules persist during the lifetime of the kernel.
To see the rough magnitude of checking that the lock validator does, here's a
portion of /proc/lockdep_stats, fresh after bootup:
lock-classes: 694 [max: 2048]
direct dependencies: 1598 [max: 8192]
indirect dependencies: 17896
all direct dependencies: 16206
dependency chains: 1910 [max: 8192]
in-hardirq chains: 17
in-softirq chains: 105
in-process chains: 1065
stack-trace entries: 38761 [max: 131072]
combined max dependencies: 2033928
hardirq-safe locks: 24
hardirq-unsafe locks: 176
softirq-safe locks: 53
softirq-unsafe locks: 137
irq-safe locks: 59
irq-unsafe locks: 176
The lock validator has observed 1598 actual single-thread locking patterns,
and has validated all possible 2033928 distinct locking scenarios.
More details about the design of the lock validator can be found in
Documentation/lockdep-design.txt, which can also be found at:
http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt
[bunk@stusta.de: cleanups]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
|
|
|
#endif
|
2006-01-10 07:59:20 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* journalling filesystem info */
|
|
|
|
void *journal_info;
|
|
|
|
|
When stacked block devices are in-use (e.g. md or dm), the recursive calls
to generic_make_request can use up a lot of space, and we would rather they
didn't.
As generic_make_request is a void function, and as it is generally not
expected that it will have any effect immediately, it is safe to delay any
call to generic_make_request until there is sufficient stack space
available.
As ->bi_next is reserved for the driver to use, it can have no valid value
when generic_make_request is called, and as __make_request implicitly
assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be
certain that all callers set it to NULL. We can therefore safely use
bi_next to link pending requests together, providing we clear it before
making the real call.
So, we choose to allow each thread to only be active in one
generic_make_request at a time. If a subsequent (recursive) call is made,
the bio is linked into a per-thread list, and is handled when the active
call completes.
As the list of pending bios is per-thread, there are no locking issues to
worry about.
I say above that it is "safe to delay any call...". There are, however,
some behaviours of a make_request_fn which would make it unsafe. These
include any behaviour that assumes anything will have changed after a
recursive call to generic_make_request.
These could include:
- waiting for that call to finish and call its bi_end_io function.
md used to sometimes do this (marking the superblock dirty before
completing a write) but doesn't any more
- inspecting the bio for fields that generic_make_request might
change, such as bi_sector or bi_bdev. It is hard to see a good
reason for this, and I don't think anyone actually does it.
- inspecting the queue to see if, e.g. it is 'full' yet. Again, I
think this is very unlikely to be useful, or to be done.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: <dm-devel@redhat.com>
Alasdair G Kergon <agk@redhat.com> said:
I can see nothing wrong with this in principle.
For device-mapper at the moment though it's essential that, while the bio
mappings may now get delayed, they still get processed in exactly
the same order as they were passed to generic_make_request().
My main concern is whether the timing changes implicit in this patch
will make the rare data-corrupting races in the existing snapshot code
more likely. (I'm working on a fix for these races, but the unfinished
patch is already several hundred lines long.)
It would be helpful if some people on this mailing list would test
this patch in various scenarios and report back.
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2007-05-01 15:53:42 +08:00
|
|
|
/* stacked block device info */
|
|
|
|
struct bio *bio_list, **bio_tail;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* VM state */
|
|
|
|
struct reclaim_state *reclaim_state;
|
|
|
|
|
|
|
|
struct backing_dev_info *backing_dev_info;
|
|
|
|
|
|
|
|
struct io_context *io_context;
|
|
|
|
|
|
|
|
unsigned long ptrace_message;
|
|
|
|
siginfo_t *last_siginfo; /* For ptrace use. */
|
2008-07-28 06:48:12 +08:00
|
|
|
struct task_io_accounting ioac;
|
2006-10-01 14:28:59 +08:00
|
|
|
#if defined(CONFIG_TASK_XACCT)
|
2005-04-17 06:20:36 +08:00
|
|
|
u64 acct_rss_mem1; /* accumulated rss usage */
|
|
|
|
u64 acct_vm_mem1; /* accumulated virtual memory usage */
|
2008-07-25 16:48:40 +08:00
|
|
|
cputime_t acct_timexpd; /* stime + utime since last update */
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_CPUSETS
|
|
|
|
nodemask_t mems_allowed;
|
|
|
|
int cpuset_mems_generation;
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a task's stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
task's mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
job's cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the job's cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
int cpuset_mem_spread_rotor;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
|
|
|
#ifdef CONFIG_CGROUPS
|
2007-10-19 14:39:36 +08:00
|
|
|
/* Control Group info protected by css_set_lock */
|
|
|
|
struct css_set *cgroups;
|
|
|
|
/* cg_list protected by css_set_lock and tsk->alloc_lock */
|
|
|
|
struct list_head cg_list;
|
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
|
|
|
#endif
|
2007-10-17 14:27:30 +08:00
|
|
|
#ifdef CONFIG_FUTEX
|
2006-03-27 17:16:22 +08:00
|
|
|
struct robust_list_head __user *robust_list;
|
2006-03-27 17:16:24 +08:00
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
struct compat_robust_list_head __user *compat_robust_list;
|
|
|
|
#endif
|
2006-06-27 17:54:58 +08:00
|
|
|
struct list_head pi_state_list;
|
|
|
|
struct futex_pi_state *pi_state_cache;
|
2008-05-15 19:09:15 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
struct mempolicy *mempolicy;
|
|
|
|
short il_next;
|
2007-10-17 14:27:30 +08:00
|
|
|
#endif
|
2005-06-27 16:55:12 +08:00
|
|
|
atomic_t fs_excl; /* holding fs exclusive resources */
|
2006-01-08 17:01:37 +08:00
|
|
|
struct rcu_head rcu;
|
2006-04-11 19:52:07 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* cache last used pipe for splice
|
|
|
|
*/
|
|
|
|
struct pipe_inode_info *splice_pipe;
|
2006-07-14 15:24:36 +08:00
|
|
|
#ifdef CONFIG_TASK_DELAY_ACCT
|
|
|
|
struct task_delay_info *delays;
|
2006-12-08 18:39:47 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_FAULT_INJECTION
|
|
|
|
int make_it_fail;
|
2006-07-14 15:24:36 +08:00
|
|
|
#endif
|
2007-10-17 14:25:50 +08:00
|
|
|
struct prop_local_single dirties;
|
2008-01-26 04:08:34 +08:00
|
|
|
#ifdef CONFIG_LATENCYTOP
|
|
|
|
int latency_record_count;
|
|
|
|
struct latency_record latency_record[LT_SAVECOUNT];
|
|
|
|
#endif
|
2008-09-02 06:52:40 +08:00
|
|
|
/*
|
|
|
|
* time slack values; these are used to round up poll() and
|
|
|
|
* select() etc timeout values. These are in nanoseconds.
|
|
|
|
*/
|
|
|
|
unsigned long timer_slack_ns;
|
|
|
|
unsigned long default_timer_slack_ns;
|
2008-11-06 16:37:40 +08:00
|
|
|
|
|
|
|
struct list_head *scm_work_list;
|
2008-11-26 04:07:04 +08:00
|
|
|
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
2008-11-23 13:22:56 +08:00
|
|
|
/* Index of current stored address in ret_stack */
|
|
|
|
int curr_ret_stack;
|
|
|
|
/* Stack of return addresses for return function tracing */
|
|
|
|
struct ftrace_ret_stack *ret_stack;
|
|
|
|
/*
|
|
|
|
* Number of functions that haven't been traced
|
|
|
|
* because of depth overrun.
|
|
|
|
*/
|
|
|
|
atomic_t trace_overrun;
|
2008-12-06 10:43:41 +08:00
|
|
|
/* Pause for the tracing */
|
|
|
|
atomic_t tracing_graph_pause;
|
2008-11-23 13:22:56 +08:00
|
|
|
#endif
|
2008-12-04 04:36:57 +08:00
|
|
|
#ifdef CONFIG_TRACING
|
|
|
|
/* state flags for use by tracers */
|
|
|
|
unsigned long trace;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2007-07-10 00:51:59 +08:00
|
|
|
/*
 * Task priority layout.
 *
 * A task's priority lies in 0..MAX_PRIO-1.  The real-time band is
 * 0..MAX_RT_PRIO-1; SCHED_NORMAL/SCHED_BATCH tasks occupy
 * MAX_RT_PRIO..MAX_PRIO-1.  The scale is inverted: a lower p->prio
 * value means a higher priority.
 *
 * MAX_USER_RT_PRIO is the maximum RT priority exported to user-space.
 * Keeping it separate from MAX_RT_PRIO lets kernel threads run at a
 * priority above anything a user task can request.
 *
 * Note: MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
 */

#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO

#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
|
|
|
|
|
|
|
|
static inline int rt_prio(int prio)
|
|
|
|
{
|
|
|
|
if (unlikely(prio < MAX_RT_PRIO))
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int rt_task(struct task_struct *p)
|
2007-07-10 00:51:59 +08:00
|
|
|
{
|
|
|
|
return rt_prio(p->prio);
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:39:46 +08:00
|
|
|
/*
 * Record @session as the numeric session id kept in @tsk's
 * signal structure (tsk->signal->__session).
 */
static inline void set_task_session(struct task_struct *tsk, pid_t session)
{
	struct signal_struct *sig = tsk->signal;

	sig->__session = session;
}
|
|
|
|
|
2007-10-19 14:40:39 +08:00
|
|
|
/*
 * Record @pgrp as the numeric process-group id kept in @tsk's
 * signal structure (tsk->signal->__pgrp).
 */
static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
{
	struct signal_struct *sig = tsk->signal;

	sig->__pgrp = pgrp;
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_pid(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->pids[PIDTYPE_PID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_tgid(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->group_leader->pids[PIDTYPE_PID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_pgrp(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->group_leader->pids[PIDTYPE_PGID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_session(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->group_leader->pids[PIDTYPE_SID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:06 +08:00
|
|
|
struct pid_namespace;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* the helpers to get the task's different pids as they are seen
|
|
|
|
* from various namespaces
|
|
|
|
*
|
|
|
|
* task_xid_nr() : global id, i.e. the id seen from the init namespace;
|
2008-02-08 20:19:15 +08:00
|
|
|
* task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
|
|
|
|
* current.
|
2007-10-19 14:40:06 +08:00
|
|
|
* task_xid_nr_ns() : id seen from the ns specified;
|
|
|
|
*
|
|
|
|
* set_task_vxid() : assigns a virtual id to a task;
|
|
|
|
*
|
|
|
|
* see also pid_nr() etc in include/linux/pid.h
|
|
|
|
*/
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline pid_t task_pid_nr(struct task_struct *tsk)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
|
|
|
return tsk->pid;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:19 +08:00
|
|
|
pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
|
|
|
/*
 * Virtual pid number of @tsk: pid_vnr() applied to the task's own
 * struct pid (see task_pid()).
 */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	return pid_vnr(pid);
}
|
|
|
|
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline pid_t task_tgid_nr(struct task_struct *tsk)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
|
|
|
return tsk->tgid;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:19 +08:00
|
|
|
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
|
|
|
/*
 * Virtual thread-group id of @tsk: pid_vnr() applied to the struct pid
 * returned by task_tgid().
 */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
	struct pid *tgid = task_tgid(tsk);

	return pid_vnr(tgid);
}
|
|
|
|
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
2007-10-19 14:40:39 +08:00
|
|
|
return tsk->signal->__pgrp;
|
2007-10-19 14:40:06 +08:00
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:19 +08:00
|
|
|
pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
|
|
|
/*
 * Virtual process-group id of @tsk: pid_vnr() applied to the struct pid
 * returned by task_pgrp().
 */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
	struct pid *pgrp = task_pgrp(tsk);

	return pid_vnr(pgrp);
}
|
|
|
|
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline pid_t task_session_nr(struct task_struct *tsk)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
|
|
|
return tsk->signal->__session;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:19 +08:00
|
|
|
pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
|
|
|
/*
 * Virtual session id of @tsk: pid_vnr() applied to the struct pid
 * returned by task_session().
 */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
	struct pid *session = task_session(tsk);

	return pid_vnr(session);
}
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* pid_alive - check that a task structure is not stale
|
|
|
|
* @p: Task structure to be checked.
|
|
|
|
*
|
|
|
|
* Test if a process is not yet dead (at most zombie state)
|
|
|
|
* If pid_alive fails, then pointers within the task structure
|
|
|
|
* can be stale and must not be dereferenced.
|
|
|
|
*/
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int pid_alive(struct task_struct *p)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:42 +08:00
|
|
|
return p->pids[PIDTYPE_PID].pid != NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-09-29 17:00:07 +08:00
|
|
|
/**
|
2007-10-19 14:39:52 +08:00
|
|
|
* is_global_init - check if a task structure is init
|
2006-10-06 15:44:01 +08:00
|
|
|
* @tsk: Task structure to be checked.
|
|
|
|
*
|
|
|
|
* Check if a task structure is the first user space task the kernel created.
|
2007-10-19 14:39:52 +08:00
|
|
|
*/
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int is_global_init(struct task_struct *tsk)
|
2007-10-19 14:40:09 +08:00
|
|
|
{
|
|
|
|
return tsk->pid == 1;
|
|
|
|
}
|
2007-10-19 14:39:52 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* is_container_init:
|
|
|
|
* check whether the task is init in its own pid namespace.
|
2006-09-29 17:00:07 +08:00
|
|
|
*/
|
2007-10-19 14:40:09 +08:00
|
|
|
extern int is_container_init(struct task_struct *tsk);
|
2006-09-29 17:00:07 +08:00
|
|
|
|
2006-10-02 17:19:00 +08:00
|
|
|
extern struct pid *cad_pid;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void free_task(struct task_struct *tsk);
|
|
|
|
/* Take a reference on @tsk by atomically incrementing tsk->usage. */
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
|
2006-01-08 17:01:37 +08:00
|
|
|
|
2006-03-31 18:31:34 +08:00
|
|
|
extern void __put_task_struct(struct task_struct *t);
|
2006-01-08 17:01:37 +08:00
|
|
|
|
|
|
|
static inline void put_task_struct(struct task_struct *t)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&t->usage))
|
2006-03-31 18:31:37 +08:00
|
|
|
__put_task_struct(t);
|
2006-01-08 17:01:37 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-09-06 00:12:23 +08:00
|
|
|
extern cputime_t task_utime(struct task_struct *p);
|
|
|
|
extern cputime_t task_stime(struct task_struct *p);
|
|
|
|
extern cputime_t task_gtime(struct task_struct *p);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Per process flags
 *
 * Each flag is a distinct bit; the description precedes the flag.
 */

/* Print alignment warning msgs. Not implemented yet, only for 486 */
#define PF_ALIGNWARN	0x00000001
/* being created */
#define PF_STARTING	0x00000002
/* getting shut down */
#define PF_EXITING	0x00000004
/* pi exit done on shut down */
#define PF_EXITPIDONE	0x00000008
/* I'm a virtual CPU */
#define PF_VCPU		0x00000010
/* forked but didn't exec */
#define PF_FORKNOEXEC	0x00000040
/* used super-user privileges */
#define PF_SUPERPRIV	0x00000100
/* dumped core */
#define PF_DUMPCORE	0x00000200
/* killed by a signal */
#define PF_SIGNALED	0x00000400
/* Allocating memory */
#define PF_MEMALLOC	0x00000800
/* responsible for disk writeback */
#define PF_FLUSHER	0x00001000
/* if unset the fpu must be initialized before use */
#define PF_USED_MATH	0x00002000
/* this thread should not be frozen */
#define PF_NOFREEZE	0x00008000
/* frozen for system suspend */
#define PF_FROZEN	0x00010000
/* inside a filesystem transaction */
#define PF_FSTRANS	0x00020000
/* I am kswapd */
#define PF_KSWAPD	0x00040000
/* I am in swapoff */
#define PF_SWAPOFF	0x00080000
/* Throttle me less: I clean memory */
#define PF_LESS_THROTTLE 0x00100000
/* I am a kernel thread */
#define PF_KTHREAD	0x00200000
/* randomize virtual address space */
#define PF_RANDOMIZE	0x00400000
/* Allowed to write to swap */
#define PF_SWAPWRITE	0x00800000
/* Spread page cache over cpuset */
#define PF_SPREAD_PAGE	0x01000000
/* Spread some slab caches over cpuset */
#define PF_SPREAD_SLAB	0x02000000
/* Thread bound to specific cpu */
#define PF_THREAD_BOUND	0x04000000
/* Non-default NUMA mempolicy */
#define PF_MEMPOLICY	0x10000000
/* Thread belongs to the rt mutex tester */
#define PF_MUTEX_TESTER	0x20000000
/* Freezer should not count it as freezable */
#define PF_FREEZER_SKIP	0x40000000
/* Freezer won't send signals to it */
#define PF_FREEZER_NOSIG 0x80000000
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Helpers for the PF_USED_MATH bit of tsk->flags.
 *
 * Normally only the _current_ task may read and write its own
 * tsk->flags; other tasks may only read them (tsk_used_math() is used
 * that way, e.g. during threaded core dumping).  ptrace and fork are
 * the exceptions: the ptracer may write the flags of its traced child,
 * and a forking parent may write the flags of the new child, because
 * in both cases the child is guaranteed not to be running and so
 * cannot be changing child->flags at the same time.  The
 * *_stopped_child_* variants exist for those two cases.
 */
#define clear_stopped_child_used_math(child) \
	do { (child)->flags &= ~PF_USED_MATH; } while (0)
#define set_stopped_child_used_math(child) \
	do { (child)->flags |= PF_USED_MATH; } while (0)
#define clear_used_math()	clear_stopped_child_used_math(current)
#define set_used_math()		set_stopped_child_used_math(current)
/* Clear the bit, then set it again iff @condition is non-zero. */
#define conditional_stopped_child_used_math(condition, child) \
	do { \
		(child)->flags &= ~PF_USED_MATH; \
		(child)->flags |= (condition) ? PF_USED_MATH : 0; \
	} while (0)
#define conditional_used_math(condition) \
	conditional_stopped_child_used_math(condition, current)
/* Make the child's PF_USED_MATH bit equal to that of current. */
#define copy_to_stopped_child_used_math(child) \
	do { \
		(child)->flags &= ~PF_USED_MATH; \
		(child)->flags |= current->flags & PF_USED_MATH; \
	} while (0)
/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p)	((p)->flags & PF_USED_MATH)
#define used_math()		tsk_used_math(current)
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2008-03-27 05:23:49 +08:00
|
|
|
extern int set_cpus_allowed_ptr(struct task_struct *p,
|
2008-11-25 00:05:14 +08:00
|
|
|
const struct cpumask *new_mask);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2008-03-27 05:23:49 +08:00
|
|
|
static inline int set_cpus_allowed_ptr(struct task_struct *p,
|
2008-11-25 00:05:14 +08:00
|
|
|
const struct cpumask *new_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-11-25 00:05:14 +08:00
|
|
|
if (!cpumask_test_cpu(0, new_mask))
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
2008-03-27 05:23:49 +08:00
|
|
|
/*
 * By-value convenience wrapper: forwards the address of the local copy
 * of @new_mask to set_cpus_allowed_ptr() and returns its result.
 */
static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
	cpumask_t *mask = &new_mask;

	return set_cpus_allowed_ptr(p, mask);
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern unsigned long long sched_clock(void);
|
2007-07-20 03:28:35 +08:00
|
|
|
|
2008-08-11 14:59:03 +08:00
|
|
|
extern void sched_clock_init(void);
|
|
|
|
extern u64 sched_clock_cpu(int cpu);
|
2008-05-04 00:29:28 +08:00
|
|
|
|
2008-08-11 14:59:03 +08:00
|
|
|
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
2008-05-04 00:29:28 +08:00
|
|
|
static inline void sched_clock_tick(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void sched_clock_idle_sleep_event(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
|
|
|
|
{
|
|
|
|
}
|
2008-04-14 14:50:02 +08:00
|
|
|
#else
|
2008-05-04 00:29:28 +08:00
|
|
|
extern void sched_clock_tick(void);
|
|
|
|
extern void sched_clock_idle_sleep_event(void);
|
|
|
|
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
|
|
|
|
#endif
|
|
|
|
|
2007-07-20 03:28:35 +08:00
|
|
|
/*
|
|
|
|
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
|
|
|
|
* clock constructed from sched_clock():
|
|
|
|
*/
|
|
|
|
extern unsigned long long cpu_clock(int cpu);
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
extern unsigned long long
|
2007-07-10 00:51:58 +08:00
|
|
|
task_sched_runtime(struct task_struct *task);
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
/*
 * Non-SMP builds have nothing to do here.  The stub expands to an empty
 * do-while rather than a bare "{}" so that "sched_exec();" is exactly
 * one statement; a bare block followed by ';' would break callers of
 * the form "if (cond) sched_exec(); else ...".
 */
#define sched_exec()	do { } while (0)
#endif
|
|
|
|
|
2007-08-23 21:18:02 +08:00
|
|
|
extern void sched_clock_idle_sleep_event(void);
|
|
|
|
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
|
2007-07-10 00:51:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
extern void idle_task_exit(void);
#else
/* Without CONFIG_HOTPLUG_CPU this is an empty stub. */
static inline void idle_task_exit(void)
{
}
#endif
|
|
|
|
|
|
|
|
extern void sched_idle_next(void);
|
2006-06-27 17:54:51 +08:00
|
|
|
|
2008-03-22 16:20:24 +08:00
|
|
|
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
extern void wake_up_idle_cpu(int cpu);
#else
/* Empty stub unless both CONFIG_NO_HZ and CONFIG_SMP are set. */
static inline void wake_up_idle_cpu(int cpu)
{
}
#endif
|
|
|
|
|
2007-08-26 00:41:53 +08:00
|
|
|
extern unsigned int sysctl_sched_latency;
|
2007-11-10 05:39:37 +08:00
|
|
|
extern unsigned int sysctl_sched_min_granularity;
|
2007-07-10 00:51:58 +08:00
|
|
|
extern unsigned int sysctl_sched_wakeup_granularity;
|
2008-12-30 02:09:17 +08:00
|
|
|
extern unsigned int sysctl_sched_shares_ratelimit;
|
|
|
|
extern unsigned int sysctl_sched_shares_thresh;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
2007-07-10 00:51:58 +08:00
|
|
|
extern unsigned int sysctl_sched_child_runs_first;
|
|
|
|
extern unsigned int sysctl_sched_features;
|
2007-10-15 23:00:18 +08:00
|
|
|
extern unsigned int sysctl_sched_migration_cost;
|
2007-11-10 05:39:39 +08:00
|
|
|
extern unsigned int sysctl_sched_nr_migrate;
|
2007-11-10 05:39:37 +08:00
|
|
|
|
|
|
|
int sched_nr_latency_handler(struct ctl_table *table, int write,
|
|
|
|
struct file *file, void __user *buffer, size_t *length,
|
|
|
|
loff_t *ppos);
|
2007-10-15 23:00:02 +08:00
|
|
|
#endif
|
2008-02-13 22:45:39 +08:00
|
|
|
extern unsigned int sysctl_sched_rt_period;
|
|
|
|
extern int sysctl_sched_rt_runtime;
|
2007-10-15 23:00:02 +08:00
|
|
|
|
2008-04-20 01:44:57 +08:00
|
|
|
int sched_rt_handler(struct ctl_table *table, int write,
|
|
|
|
struct file *filp, void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos);
|
|
|
|
|
2007-10-15 23:00:02 +08:00
|
|
|
extern unsigned int sysctl_sched_compat_yield;
|
2007-07-10 00:51:58 +08:00
|
|
|
|
2006-06-27 17:54:51 +08:00
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
2006-07-03 15:25:41 +08:00
|
|
|
extern int rt_mutex_getprio(struct task_struct *p);
|
|
|
|
extern void rt_mutex_setprio(struct task_struct *p, int prio);
|
|
|
|
extern void rt_mutex_adjust_pi(struct task_struct *p);
|
2006-06-27 17:54:51 +08:00
|
|
|
#else
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int rt_mutex_getprio(struct task_struct *p)
|
2006-06-27 17:54:51 +08:00
|
|
|
{
|
|
|
|
return p->normal_prio;
|
|
|
|
}
|
2006-06-27 17:55:02 +08:00
|
|
|
# define rt_mutex_adjust_pi(p) do { } while (0)
|
2006-06-27 17:54:51 +08:00
|
|
|
#endif
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
extern void set_user_nice(struct task_struct *p, long nice);
|
|
|
|
extern int task_prio(const struct task_struct *p);
|
|
|
|
extern int task_nice(const struct task_struct *p);
|
|
|
|
extern int can_nice(const struct task_struct *p, const int nice);
|
|
|
|
extern int task_curr(const struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int idle_cpu(int cpu);
|
|
|
|
extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
|
2008-06-23 11:55:38 +08:00
|
|
|
extern int sched_setscheduler_nocheck(struct task_struct *, int,
|
|
|
|
struct sched_param *);
|
2006-07-03 15:25:41 +08:00
|
|
|
extern struct task_struct *idle_task(int cpu);
|
|
|
|
extern struct task_struct *curr_task(int cpu);
|
|
|
|
extern void set_curr_task(int cpu, struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
void yield(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The default (Linux) execution domain.
|
|
|
|
*/
|
|
|
|
extern struct exec_domain default_exec_domain;
|
|
|
|
|
|
|
|
/*
 * Overlay of a struct thread_info with an array of unsigned long
 * holding THREAD_SIZE/sizeof(long) elements.  As union members, both
 * start at the same address and share the same storage.
 */
union thread_union {
|
|
|
|
struct thread_info thread_info;
|
|
|
|
unsigned long stack[THREAD_SIZE/sizeof(long)];
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifndef __HAVE_ARCH_KSTACK_END
|
|
|
|
static inline int kstack_end(void *addr)
|
|
|
|
{
|
|
|
|
/* Reliable end of stack detection:
|
|
|
|
* Some APM bios versions misalign the stack
|
|
|
|
*/
|
|
|
|
return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern union thread_union init_thread_union;
|
|
|
|
extern struct task_struct init_task;
|
|
|
|
|
|
|
|
extern struct mm_struct init_mm;
|
|
|
|
|
2007-10-19 14:40:06 +08:00
|
|
|
extern struct pid_namespace init_pid_ns;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* find a task by one of its numerical ids
|
|
|
|
*
|
|
|
|
* find_task_by_pid_type_ns():
|
|
|
|
* it is the most generic call - it finds a task by the id,
|
|
|
|
* type and namespace specified
|
|
|
|
* find_task_by_pid_ns():
|
|
|
|
* finds a task by its pid in the specified namespace
|
2007-10-19 14:40:16 +08:00
|
|
|
* find_task_by_vpid():
|
|
|
|
* finds a task by its virtual pid
|
2007-10-19 14:40:06 +08:00
|
|
|
*
|
2008-07-25 16:48:36 +08:00
|
|
|
* see also find_vpid() etc in include/linux/pid.h
|
2007-10-19 14:40:06 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
|
|
|
|
struct pid_namespace *ns);
|
|
|
|
|
2007-10-19 14:40:16 +08:00
|
|
|
extern struct task_struct *find_task_by_vpid(pid_t nr);
|
|
|
|
extern struct task_struct *find_task_by_pid_ns(pid_t nr,
|
|
|
|
struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
2008-02-08 20:19:09 +08:00
|
|
|
extern void __set_special_pids(struct pid *pid);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* per-UID process charging. */
|
2007-07-16 14:40:59 +08:00
|
|
|
extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline struct user_struct *get_uid(struct user_struct *u)
|
|
|
|
{
|
|
|
|
atomic_inc(&u->__count);
|
|
|
|
return u;
|
|
|
|
}
|
|
|
|
extern void free_uid(struct user_struct *);
|
Fix user namespace exiting OOPs
It turned out, that the user namespace is released during the do_exit() in
exit_task_namespaces(), but the struct user_struct is released only during the
put_task_struct(), i.e. MUCH later.
On debug kernels with poisoned slabs this will cause the oops in
uid_hash_remove() because the head of the chain, which resides inside the
struct user_namespace, will be already freed and poisoned.
Since the uid hash itself is required only when someone can search it, i.e.
when the namespace is alive, we can safely unhash all the user_struct-s from
it during the namespace exiting. The subsequent free_uid() will complete the
user_struct destruction.
For example simple program
#include <sched.h>
char stack[2 * 1024 * 1024];
int f(void *foo)
{
return 0;
}
int main(void)
{
clone(f, stack + 1 * 1024 * 1024, 0x10000000, 0);
return 0;
}
run on kernel with CONFIG_USER_NS turned on will oops the
kernel immediately.
This was spotted during OpenVZ kernel testing.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Acked-by: "Serge E. Hallyn" <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-09-19 13:46:45 +08:00
|
|
|
extern void release_uids(struct user_namespace *ns);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/current.h>
|
|
|
|
|
2006-09-29 17:00:32 +08:00
|
|
|
extern void do_timer(unsigned long ticks);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-14 07:03:15 +08:00
|
|
|
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
|
|
|
|
extern int wake_up_process(struct task_struct *tsk);
|
|
|
|
extern void wake_up_new_task(struct task_struct *tsk,
|
|
|
|
unsigned long clone_flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern void kick_process(struct task_struct *tsk);
|
|
|
|
#else
|
|
|
|
/* Variant used when CONFIG_SMP is not set: intentionally does nothing. */
static inline void kick_process(struct task_struct *tsk)
{
}
|
|
|
|
#endif
|
2007-07-10 00:52:00 +08:00
|
|
|
extern void sched_fork(struct task_struct *p, int clone_flags);
|
|
|
|
extern void sched_dead(struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern void proc_caches_init(void);
|
|
|
|
extern void flush_signals(struct task_struct *);
|
2007-05-09 17:34:37 +08:00
|
|
|
extern void ignore_signals(struct task_struct *);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void flush_signal_handlers(struct task_struct *, int force_default);
|
|
|
|
extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
|
|
|
|
|
|
|
|
/*
 * dequeue_signal_lock - call dequeue_signal() with the sighand lock held.
 * @tsk:  task whose ->sighand->siglock is taken around the call
 * @mask: handed unchanged to dequeue_signal()
 * @info: handed unchanged to dequeue_signal()
 *
 * The lock is taken with spin_lock_irqsave() and dropped with
 * spin_unlock_irqrestore().  Returns whatever dequeue_signal() returned.
 */
static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
	unsigned long irqflags;
	int result;

	spin_lock_irqsave(&tsk->sighand->siglock, irqflags);
	result = dequeue_signal(tsk, mask, info);
	spin_unlock_irqrestore(&tsk->sighand->siglock, irqflags);

	return result;
}
|
|
|
|
|
|
|
|
extern void block_all_signals(int (*notifier)(void *priv), void *priv,
|
|
|
|
sigset_t *mask);
|
|
|
|
extern void unblock_all_signals(void);
|
|
|
|
extern void release_task(struct task_struct * p);
|
|
|
|
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
|
|
|
|
extern int force_sigsegv(int, struct task_struct *);
|
|
|
|
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
|
2006-10-02 17:17:10 +08:00
|
|
|
extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
|
|
|
|
extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
|
2006-10-02 17:17:28 +08:00
|
|
|
extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
|
2006-10-02 17:17:10 +08:00
|
|
|
extern int kill_pgrp(struct pid *pid, int sig, int priv);
|
|
|
|
extern int kill_pid(struct pid *pid, int sig, int priv);
|
2007-02-09 23:11:47 +08:00
|
|
|
extern int kill_proc_info(int, struct siginfo *, pid_t);
|
2008-07-26 10:45:54 +08:00
|
|
|
extern int do_notify_parent(struct task_struct *, int);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void force_sig(int, struct task_struct *);
|
|
|
|
extern void force_sig_specific(int, struct task_struct *);
|
|
|
|
extern int send_sig(int, struct task_struct *, int);
|
|
|
|
extern void zap_other_threads(struct task_struct *p);
|
|
|
|
extern struct sigqueue *sigqueue_alloc(void);
|
|
|
|
extern void sigqueue_free(struct sigqueue *);
|
2008-04-30 15:52:57 +08:00
|
|
|
extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
|
2006-02-10 03:41:50 +08:00
|
|
|
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
|
|
|
|
|
2006-10-02 17:19:00 +08:00
|
|
|
static inline int kill_cad_pid(int sig, int priv)
|
|
|
|
{
|
|
|
|
return kill_pid(cad_pid, sig, priv);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Magic pointer values that can be passed as the second argument of
 * send_sig_info()/send_group_sig_info() in place of a real siginfo.
 * They are the small integers 0, 1 and 2 cast to a siginfo pointer.
 */
#define SEND_SIG_NOINFO	((struct siginfo *)0)
#define SEND_SIG_PRIV	((struct siginfo *)1)
#define SEND_SIG_FORCED	((struct siginfo *)2)
|
|
|
|
|
2005-10-31 07:03:45 +08:00
|
|
|
static inline int is_si_special(const struct siginfo *info)
|
|
|
|
{
|
|
|
|
return info <= SEND_SIG_FORCED;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* True if we are on the alternate signal stack. */
|
|
|
|
|
|
|
|
static inline int on_sig_stack(unsigned long sp)
|
|
|
|
{
|
|
|
|
return (sp - current->sas_ss_sp < current->sas_ss_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int sas_ss_flags(unsigned long sp)
|
|
|
|
{
|
|
|
|
return (current->sas_ss_size == 0 ? SS_DISABLE
|
|
|
|
: on_sig_stack(sp) ? SS_ONSTACK : 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines for handling mm_structs
|
|
|
|
*/
|
|
|
|
extern struct mm_struct * mm_alloc(void);
|
|
|
|
|
|
|
|
/* mmdrop drops the mm and the page tables */
|
2008-02-14 07:03:15 +08:00
|
|
|
extern void __mmdrop(struct mm_struct *);
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline void mmdrop(struct mm_struct * mm)
|
|
|
|
{
|
2007-07-10 00:52:01 +08:00
|
|
|
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
|
2005-04-17 06:20:36 +08:00
|
|
|
__mmdrop(mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* mmput gets rid of the mappings and all user-space */
|
|
|
|
extern void mmput(struct mm_struct *);
|
|
|
|
/* Grab a reference to a task's mm, if it is not already going away */
|
|
|
|
extern struct mm_struct *get_task_mm(struct task_struct *task);
|
|
|
|
/* Remove the current tasks stale references to the old mm_struct */
|
|
|
|
extern void mm_release(struct task_struct *, struct mm_struct *);
|
2008-03-26 01:47:10 +08:00
|
|
|
/* Allocate a new mm structure and copy contents from tsk->mm */
|
|
|
|
extern struct mm_struct *dup_mm(struct task_struct *tsk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
|
|
|
|
extern void flush_thread(void);
|
|
|
|
extern void exit_thread(void);
|
|
|
|
|
|
|
|
extern void exit_files(struct task_struct *);
|
2006-03-29 08:11:16 +08:00
|
|
|
extern void __cleanup_signal(struct signal_struct *);
|
2006-03-29 08:11:27 +08:00
|
|
|
extern void __cleanup_sighand(struct sighand_struct *);
|
2008-05-27 00:55:42 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void exit_itimers(struct signal_struct *);
|
2008-05-27 00:55:42 +08:00
|
|
|
extern void flush_itimer_signals(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern NORET_TYPE void do_group_exit(int);
|
|
|
|
|
|
|
|
extern void daemonize(const char *, ...);
|
|
|
|
extern int allow_signal(int);
|
|
|
|
extern int disallow_signal(int);
|
|
|
|
|
|
|
|
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
|
|
|
|
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct *fork_idle(int);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern void set_task_comm(struct task_struct *tsk, char *from);
|
2008-02-05 14:27:21 +08:00
|
|
|
extern char *get_task_comm(char *to, struct task_struct *tsk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2008-07-26 10:45:58 +08:00
|
|
|
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2008-07-26 10:45:58 +08:00
|
|
|
/*
 * Variant used when CONFIG_SMP is not set: neither argument is looked at
 * and the function always returns 1.
 */
static inline unsigned long wait_task_inactive(struct task_struct *p,
					       long match_state)
{
	return 1UL;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
2006-04-19 13:20:16 +08:00
|
|
|
/*
 * next_task - the task_struct whose ->tasks node follows p's on the list.
 * The ->tasks.next pointer is read through rcu_dereference().
 */
#define next_task(p) \
	list_entry(rcu_dereference((p)->tasks.next), \
		   struct task_struct, tasks)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * for_each_process - loop header that steps p along next_task().
 *
 * p starts at &init_task and is advanced before every test, so the body
 * first runs for the task after init_task and the loop ends once the walk
 * gets back to init_task; init_task itself is never handed to the body.
 * p must be an assignable task_struct pointer.
 */
#define for_each_process(p) \
	for ((p) = &init_task; ((p) = next_task(p)) != &init_task; )
|
|
|
|
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may be incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
extern bool is_single_threaded(struct task_struct *);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Careful: do_each_thread/while_each_thread is a double loop so
|
|
|
|
* 'break' will not work as expected - use goto instead.
|
|
|
|
*/
|
|
|
|
/*
 * do_each_thread - opening half of the two-level thread walk.
 *
 * Outer loop: g (and t with it) steps along next_task() from init_task
 * until the walk returns to init_task.  The trailing "do" opens the inner
 * do/while that the caller closes with while_each_thread(g, t).
 */
#define do_each_thread(g, t) \
	for ((g) = (t) = &init_task; ((g) = (t) = next_task(g)) != &init_task; ) do
|
|
|
|
|
|
|
|
/*
 * while_each_thread - loop condition that advances t with next_thread()
 * and keeps going until t is back at g.
 */
#define while_each_thread(g, t) \
	while (((t) = next_thread(t)) != (g))
|
|
|
|
|
2006-04-11 07:16:49 +08:00
|
|
|
/* de_thread depends on thread_group_leader not being a pid based check */
|
|
|
|
/*
 * thread_group_leader - true when p is its own ->group_leader.
 *
 * This is a pointer comparison, not a pid comparison.  The argument is
 * parenthesized so that expressions such as &task or a ?: selection expand
 * correctly; note that it is still evaluated twice, so it must be free of
 * side effects.
 */
#define thread_group_leader(p)	((p) == (p)->group_leader)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in one system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has shown up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 512 (0x200) pids. There
are only 64 (0x40) cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of its thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
|
|
|
/* Due to the insanities of de_thread it is possible for a process
|
|
|
|
* to have the pid of the thread group leader without actually being
|
|
|
|
* the thread group leader. For iteration through the pids in proc
|
|
|
|
* all we care about is that we have a task with the appropriate
|
|
|
|
* pid, we don't actually care if we have the right task.
|
|
|
|
*/
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int has_group_leader_pid(struct task_struct *p)
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
|
|
|
{
|
|
|
|
return p->pid == p->tgid;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:18 +08:00
|
|
|
static inline
|
|
|
|
int same_thread_group(struct task_struct *p1, struct task_struct *p2)
|
|
|
|
{
|
|
|
|
return p1->tgid == p2->tgid;
|
|
|
|
}
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
static inline struct task_struct *next_thread(const struct task_struct *p)
|
2006-03-29 08:11:25 +08:00
|
|
|
{
|
|
|
|
return list_entry(rcu_dereference(p->thread_group.next),
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct, thread_group);
|
2006-03-29 08:11:25 +08:00
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int thread_group_empty(struct task_struct *p)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-03-29 08:11:25 +08:00
|
|
|
return list_empty(&p->thread_group);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * delay_group_leader - true when p is a thread group leader whose
 * thread_group list is not empty.
 */
#define delay_group_leader(p)	(thread_group_leader(p) && !thread_group_empty(p))
|
|
|
|
|
|
|
|
/*
|
2006-06-23 17:05:18 +08:00
|
|
|
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
2005-06-27 16:55:12 +08:00
|
|
|
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
|
|
|
* pins the final release of task.io_context. Also protects ->cpuset and
|
|
|
|
* ->cgroup.subsys[].
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Nests both inside and outside of read_lock(&tasklist_lock).
|
|
|
|
* It must not be nested with write_lock_irq(&tasklist_lock),
|
|
|
|
* neither inside nor outside.
|
|
|
|
*/
|
|
|
|
static inline void task_lock(struct task_struct *p)
|
|
|
|
{
|
|
|
|
spin_lock(&p->alloc_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void task_unlock(struct task_struct *p)
|
|
|
|
{
|
|
|
|
spin_unlock(&p->alloc_lock);
|
|
|
|
}
|
|
|
|
|
2006-03-29 08:11:13 +08:00
|
|
|
extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
|
|
|
|
unsigned long *flags);
|
|
|
|
|
|
|
|
static inline void unlock_task_sighand(struct task_struct *tsk,
|
|
|
|
unsigned long *flags)
|
|
|
|
{
|
|
|
|
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
|
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:57 +08:00
|
|
|
#ifndef __HAVE_THREAD_FUNCTIONS
|
|
|
|
|
2007-05-09 17:35:17 +08:00
|
|
|
/*
 * Both accessors read task->stack: task_stack_page() yields it as is,
 * task_thread_info() yields it cast to a thread_info pointer.
 */
#define task_stack_page(task)	((task)->stack)
#define task_thread_info(task)	((struct thread_info *)((task)->stack))
|
2005-11-14 08:06:55 +08:00
|
|
|
|
2005-11-14 08:06:56 +08:00
|
|
|
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
|
|
|
|
{
|
|
|
|
*task_thread_info(p) = *task_thread_info(org);
|
|
|
|
task_thread_info(p)->task = p;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long *end_of_stack(struct task_struct *p)
|
|
|
|
{
|
2007-05-09 17:35:17 +08:00
|
|
|
return (unsigned long *)(task_thread_info(p) + 1);
|
2005-11-14 08:06:56 +08:00
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:57 +08:00
|
|
|
#endif
|
|
|
|
|
2008-07-24 12:26:53 +08:00
|
|
|
static inline int object_is_on_stack(void *obj)
|
|
|
|
{
|
|
|
|
void *stack = task_stack_page(current);
|
|
|
|
|
|
|
|
return (obj >= stack) && (obj < (stack + THREAD_SIZE));
|
|
|
|
}
|
|
|
|
|
2008-04-18 14:56:15 +08:00
|
|
|
extern void thread_info_cache_init(void);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* set thread flags in other task's structures
|
|
|
|
* - see asm/thread_info.h for TIF_xxxx flags available
|
|
|
|
*/
|
|
|
|
/* set_tsk_thread_flag - set_ti_thread_flag() on tsk's thread_info. */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	set_ti_thread_flag(ti, flag);
}
|
|
|
|
|
|
|
|
/* clear_tsk_thread_flag - clear_ti_thread_flag() on tsk's thread_info. */
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	clear_ti_thread_flag(ti, flag);
}
|
|
|
|
|
|
|
|
/*
 * test_and_set_tsk_thread_flag - test_and_set_ti_thread_flag() on tsk's
 * thread_info; the helper's return value is passed back unchanged.
 */
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	return test_and_set_ti_thread_flag(ti, flag);
}
|
|
|
|
|
|
|
|
/*
 * test_and_clear_tsk_thread_flag - test_and_clear_ti_thread_flag() on
 * tsk's thread_info; the helper's return value is passed back unchanged.
 */
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	return test_and_clear_ti_thread_flag(ti, flag);
}
|
|
|
|
|
|
|
|
/*
 * test_tsk_thread_flag - test_ti_thread_flag() on tsk's thread_info; the
 * helper's return value is passed back unchanged.
 */
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	return test_ti_thread_flag(ti, flag);
}
|
|
|
|
|
|
|
|
static inline void set_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void clear_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
2008-04-23 19:13:29 +08:00
|
|
|
static inline int test_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline int signal_pending(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
|
|
|
|
}
|
2007-12-07 00:15:50 +08:00
|
|
|
|
2008-02-14 07:03:15 +08:00
|
|
|
extern int __fatal_signal_pending(struct task_struct *p);
|
2007-12-07 00:15:50 +08:00
|
|
|
|
|
|
|
/*
 * Report whether task @p has a fatal signal pending.
 *
 * The inline signal_pending() check runs first; the out-of-line
 * __fatal_signal_pending() is only consulted when it succeeds.
 * Returns 1 if both checks are true, 0 otherwise.
 */
static inline int
fatal_signal_pending(struct task_struct *p)
{
	if (!signal_pending(p))
		return 0;

	return __fatal_signal_pending(p) != 0;
}
|
|
|
|
|
sched: fix TASK_WAKEKILL vs SIGKILL race
schedule() has the special "TASK_INTERRUPTIBLE && signal_pending()" case,
this allows us to do
current->state = TASK_INTERRUPTIBLE;
schedule();
without fear to sleep with pending signal.
However, the code like
current->state = TASK_KILLABLE;
schedule();
is not right, schedule() doesn't take TASK_WAKEKILL into account. This means
that mutex_lock_killable(), wait_for_completion_killable(), down_killable(),
schedule_timeout_killable() can miss SIGKILL (and btw the second SIGKILL has
no effect).
Introduce the new helper, signal_pending_state(), and change schedule() to
use it. Hopefully it will have more users, that is why the task's state is
passed separately.
Note this "__TASK_STOPPED | __TASK_TRACED" check in signal_pending_state().
This is needed to preserve the current behaviour (ptrace_notify). I hope
this check will be removed soon, but this (afaics good) change needs the
separate discussion.
The fast path is "(state & (INTERRUPTIBLE | WAKEKILL)) + signal_pending(p)",
basically the same that schedule() does now. However, this patch of course
bloats schedule().
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-06-09 01:20:41 +08:00
|
|
|
static inline int signal_pending_state(long state, struct task_struct *p)
|
|
|
|
{
|
|
|
|
if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
|
|
|
|
return 0;
|
|
|
|
if (!signal_pending(p))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline int need_resched(void)
|
|
|
|
{
|
2008-05-13 01:14:22 +08:00
|
|
|
return unlikely(test_thread_flag(TIF_NEED_RESCHED));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cond_resched() and cond_resched_lock(): latency reduction via
|
|
|
|
* explicit rescheduling in places that are safe. The return
|
|
|
|
* value indicates whether a reschedule was done in fact.
|
|
|
|
* cond_resched_lock() will drop the spinlock before scheduling,
|
|
|
|
* cond_resched_softirq() will enable bhs before scheduling.
|
|
|
|
*/
|
2008-05-12 07:04:48 +08:00
|
|
|
extern int _cond_resched(void);
|
2008-05-13 04:34:13 +08:00
|
|
|
/*
 * cond_resched(): with CONFIG_PREEMPT_BKL this is a no-op that always
 * returns 0; otherwise it hands off to _cond_resched() and returns its
 * result.
 */
static inline int cond_resched(void)
{
#ifdef CONFIG_PREEMPT_BKL
	return 0;
#else
	return _cond_resched();
#endif
}
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int cond_resched_lock(spinlock_t * lock);
|
|
|
|
extern int cond_resched_softirq(void);
|
2008-05-12 07:04:48 +08:00
|
|
|
/*
 * cond_resched_bkl(): always calls _cond_resched(), independent of
 * CONFIG_PREEMPT_BKL, and returns its result.
 */
static inline int
cond_resched_bkl(void)
{
	int rescheduled = _cond_resched();

	return rescheduled;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Does a critical section need to be broken due to another
|
2008-01-30 20:31:20 +08:00
|
|
|
* task waiting?: (technically does not depend on CONFIG_PREEMPT,
|
|
|
|
* but a general need for low latency)
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2008-01-30 20:31:20 +08:00
|
|
|
/*
 * Should the holder of @lock break its critical section?
 *
 * Without CONFIG_PREEMPT the answer is always 0.  With CONFIG_PREEMPT
 * the answer is whatever spin_is_contended() reports for @lock.
 */
static inline int
spin_needbreak(spinlock_t *lock)
{
#ifndef CONFIG_PREEMPT
	return 0;
#else
	return spin_is_contended(lock);
#endif
}
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/*
|
|
|
|
* Thread group CPU time accounting.
|
|
|
|
*/
|
|
|
|
|
2008-09-13 00:54:39 +08:00
|
|
|
extern int thread_group_cputime_alloc(struct task_struct *);
|
|
|
|
extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
|
|
|
|
static inline void thread_group_cputime_init(struct signal_struct *sig)
|
|
|
|
{
|
|
|
|
sig->cputime.totals = NULL;
|
|
|
|
}
|
|
|
|
|
2008-09-13 00:54:39 +08:00
|
|
|
static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
{
|
|
|
|
if (curr->signal->cputime.totals)
|
|
|
|
return 0;
|
2008-09-13 00:54:39 +08:00
|
|
|
return thread_group_cputime_alloc(curr);
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void thread_group_cputime_free(struct signal_struct *sig)
|
|
|
|
{
|
2008-09-13 00:54:39 +08:00
|
|
|
free_percpu(sig->cputime.totals);
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
}
|
|
|
|
|
2007-05-24 04:57:44 +08:00
|
|
|
/*
|
|
|
|
* Reevaluate whether the task has signals pending delivery.
|
|
|
|
* Wake the task if so.
|
|
|
|
* This is required every time the blocked sigset_t changes.
|
|
|
|
* callers must hold sighand->siglock.
|
|
|
|
*/
|
|
|
|
extern void recalc_sigpending_and_wake(struct task_struct *t);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void recalc_sigpending(void);
|
|
|
|
|
|
|
|
extern void signal_wake_up(struct task_struct *t, int resume_stopped);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wrappers for p->thread_info->cpu access. No-op on UP.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
/*
 * CONFIG_SMP variant: return the cpu field recorded in the thread_info
 * that task_thread_info() yields for @p.
 */
static inline unsigned int task_cpu(const struct task_struct *p)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
return task_thread_info(p)->cpu;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/*
 * Non-SMP variant: @p is not consulted; the result is always 0.
 */
static inline unsigned int task_cpu(const struct task_struct *p)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Non-SMP variant: empty stub with the same signature as the out-of-line
 * set_task_cpu() declared in the CONFIG_SMP branch; @p and @cpu are ignored.
 */
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
|
|
|
extern void arch_pick_mmap_layout(struct mm_struct *mm);
|
|
|
|
|
2008-05-13 03:20:52 +08:00
|
|
|
#ifdef CONFIG_TRACING
|
|
|
|
extern void
|
|
|
|
__trace_special(void *__tr, void *__data,
|
|
|
|
unsigned long arg1, unsigned long arg2, unsigned long arg3);
|
|
|
|
#else
|
|
|
|
/*
 * Fallback used when CONFIG_TRACING is not defined: an empty inline with the
 * same signature as the extern __trace_special(), so callers compile
 * unchanged. All arguments are ignored.
 */
static inline void
|
|
|
|
__trace_special(void *__tr, void *__data,
|
|
|
|
unsigned long arg1, unsigned long arg2, unsigned long arg3)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-11-25 00:05:14 +08:00
|
|
|
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
|
|
|
|
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
|
2006-06-27 17:54:42 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void normalize_rt_tasks(void);
|
|
|
|
|
2008-02-13 22:45:40 +08:00
|
|
|
#ifdef CONFIG_GROUP_SCHED
|
2007-10-15 23:00:09 +08:00
|
|
|
|
2007-10-15 23:00:14 +08:00
|
|
|
extern struct task_group init_task_group;
|
2008-04-20 01:45:00 +08:00
|
|
|
#ifdef CONFIG_USER_SCHED
|
|
|
|
extern struct task_group root_task_group;
|
2008-12-01 23:19:05 +08:00
|
|
|
extern void set_tg_uid(struct user_struct *user);
|
2008-04-20 01:45:00 +08:00
|
|
|
#endif
|
2007-10-15 23:00:09 +08:00
|
|
|
|
2008-04-20 01:44:59 +08:00
|
|
|
extern struct task_group *sched_create_group(struct task_group *parent);
|
2007-10-15 23:00:14 +08:00
|
|
|
extern void sched_destroy_group(struct task_group *tg);
|
2007-10-15 23:00:09 +08:00
|
|
|
extern void sched_move_task(struct task_struct *tsk);
|
2008-02-13 22:45:40 +08:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2007-10-15 23:00:14 +08:00
|
|
|
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
|
2007-10-15 23:00:14 +08:00
|
|
|
extern unsigned long sched_group_shares(struct task_group *tg);
|
2008-02-13 22:45:40 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2008-02-13 22:45:39 +08:00
|
|
|
extern int sched_group_set_rt_runtime(struct task_group *tg,
|
|
|
|
long rt_runtime_us);
|
|
|
|
extern long sched_group_rt_runtime(struct task_group *tg);
|
2008-04-20 01:44:57 +08:00
|
|
|
extern int sched_group_set_rt_period(struct task_group *tg,
|
|
|
|
long rt_period_us);
|
|
|
|
extern long sched_group_rt_period(struct task_group *tg);
|
2008-02-13 22:45:40 +08:00
|
|
|
#endif
|
2007-10-15 23:00:09 +08:00
|
|
|
#endif
|
|
|
|
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are unconditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
#ifdef CONFIG_TASK_XACCT
|
|
|
|
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.rchar += amt;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are unconditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.wchar += amt;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are unconditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inc_syscr(struct task_struct *tsk)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.syscr++;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are unconditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inc_syscw(struct task_struct *tsk)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.syscw++;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are unconditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * add_rchar - empty variant used in the #else branch.
 * @tsk: ignored.
 * @amt: ignored.
 *
 * The body is empty, so a call has no effect.
 */
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * add_wchar - empty variant used in the #else branch.
 * @tsk: ignored.
 * @amt: ignored.
 *
 * The body is empty, so a call has no effect.
 */
static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * inc_syscr - empty variant used in the #else branch.
 * @tsk: ignored.
 *
 * Same signature as the counting variant in the first branch of the
 * conditional, but the body is empty, so a call has no effect.
 */
static inline void inc_syscr(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * inc_syscw - empty variant used in the #else branch.
 * @tsk: ignored.
 *
 * Same signature as the counting variant in the first branch of the
 * conditional, but the body is empty, so a call has no effect.
 */
static inline void inc_syscw(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-02-05 14:28:59 +08:00
|
|
|
/*
 * Fallback definition of TASK_SIZE_OF(): only takes effect when nothing
 * seen earlier has already defined the macro. In this default form the
 * tsk argument is not used and the macro expands to plain TASK_SIZE.
 */
#ifndef TASK_SIZE_OF
|
|
|
|
#define TASK_SIZE_OF(tsk) TASK_SIZE
|
|
|
|
#endif
|
|
|
|
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead add an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_set by
cgroup_exit(), thus all future charges from running threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 16:00:16 +08:00
|
|
|
#ifdef CONFIG_MM_OWNER
|
|
|
|
/*
 * Declarations used when CONFIG_MM_OWNER is defined: both functions are
 * only declared here and must be defined in some other translation unit.
 * The #else branch supplies empty inline functions with the same
 * signatures instead.
 */
extern void mm_update_next_owner(struct mm_struct *mm);
|
|
|
|
extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
|
|
|
|
#else
|
|
|
|
/*
 * mm_update_next_owner - empty variant used when CONFIG_MM_OWNER is not
 * defined.
 * @mm: ignored.
 *
 * The body is empty, so a call has no effect.
 */
static inline void mm_update_next_owner(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * mm_init_owner - empty variant used when CONFIG_MM_OWNER is not defined.
 * @mm: ignored.
 * @p: ignored.
 *
 * The body is empty, so a call has no effect.
 */
static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_MM_OWNER */
|
|
|
|
|
2008-05-13 03:20:41 +08:00
|
|
|
/*
 * String of seven single-letter codes.
 * NOTE(review): presumably one letter per task state, in the same order as
 * the task state definitions - confirm against those definitions and the
 * users of this macro before adding or reordering letters.
 */
#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
|
|
|
|
#endif
|