sched: cfs core code
[deliverable/linux.git] / kernel / sched.c
1da177e4
LT
1/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
c59ede7b 30#include <linux/capability.h>
1da177e4
LT
31#include <linux/completion.h>
32#include <linux/kernel_stat.h>
9a11b49a 33#include <linux/debug_locks.h>
1da177e4
LT
34#include <linux/security.h>
35#include <linux/notifier.h>
36#include <linux/profile.h>
7dfb7103 37#include <linux/freezer.h>
198e2f18 38#include <linux/vmalloc.h>
1da177e4
LT
39#include <linux/blkdev.h>
40#include <linux/delay.h>
41#include <linux/smp.h>
42#include <linux/threads.h>
43#include <linux/timer.h>
44#include <linux/rcupdate.h>
45#include <linux/cpu.h>
46#include <linux/cpuset.h>
47#include <linux/percpu.h>
48#include <linux/kthread.h>
49#include <linux/seq_file.h>
50#include <linux/syscalls.h>
51#include <linux/times.h>
8f0ab514 52#include <linux/tsacct_kern.h>
c6fd91f0 53#include <linux/kprobes.h>
0ff92245 54#include <linux/delayacct.h>
5517d86b 55#include <linux/reciprocal_div.h>
1da177e4 56
5517d86b 57#include <asm/tlb.h>
1da177e4
LT
58#include <asm/unistd.h>
59
b035b6de
AD
60/*
61 * Scheduler clock - returns current time in nanosec units.
62 * This is default implementation.
63 * Architectures and sub-architectures can override this.
64 */
65unsigned long long __attribute__((weak)) sched_clock(void)
66{
67 return (unsigned long long)jiffies * (1000000000 / HZ);
68}
69
1da177e4
LT
70/*
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
73 * and back.
74 */
75#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
78
79/*
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
83 */
84#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
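/*
 * Worked example of the mappings above, assuming the usual values
 * MAX_RT_PRIO == 100 and MAX_PRIO == 140:
 *
 *   nice -20 -> static_prio 100 -> user prio  0
 *   nice   0 -> static_prio 120 -> user prio 20
 *   nice  19 -> static_prio 139 -> user prio 39
 *
 * i.e. MAX_USER_PRIO evaluates to 40, one slot per nice level.
 */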
87
88/*
89 * Some helpers for converting nanosecond timing to jiffy resolution
90 */
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93
6aa645ea
IM
94#define NICE_0_LOAD SCHED_LOAD_SCALE
95#define NICE_0_SHIFT SCHED_LOAD_SHIFT
96
1da177e4
LT
97/*
98 * These are the 'tuning knobs' of the scheduler:
99 *
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
103 */
104#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105#define DEF_TIMESLICE (100 * HZ / 1000)
106#define ON_RUNQUEUE_WEIGHT 30
107#define CHILD_PENALTY 95
108#define PARENT_PENALTY 100
109#define EXIT_WEIGHT 3
110#define PRIO_BONUS_RATIO 25
111#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
112#define INTERACTIVE_DELTA 2
113#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
114#define STARVATION_LIMIT (MAX_SLEEP_AVG)
115#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
116
117/*
118 * If a task is 'interactive' then we reinsert it in the active
119 * array after it has expired its current timeslice. (it will not
120 * continue to run immediately, it will still roundrobin with
121 * other interactive tasks.)
122 *
123 * This part scales the interactivity limit depending on niceness.
124 *
125 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
126 * Here are a few examples of different nice levels:
127 *
128 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
129 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
130 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
131 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
132 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
133 *
134 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
135 * priority range a task can explore, a value of '1' means the
136 * task is rated interactive.)
137 *
 138 * I.e. nice +19 tasks can never get 'interactive' enough to be
 139 * reinserted into the active array. And only heavy CPU-hog nice -20
140 * tasks will be expired. Default nice 0 tasks are somewhere between,
141 * it takes some effort for them to get interactive, but it's not
142 * too hard.
143 */
144
145#define CURRENT_BONUS(p) \
146 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
147 MAX_SLEEP_AVG)
148
149#define GRANULARITY (10 * HZ / 1000 ? : 1)
150
151#ifdef CONFIG_SMP
152#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
154 num_online_cpus())
155#else
156#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
157 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
158#endif
159
160#define SCALE(v1,v1_max,v2_max) \
161 (v1) * (v2_max) / (v1_max)
162
163#define DELTA(p) \
013d3868
MA
164 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
165 INTERACTIVE_DELTA)
1da177e4
LT
166
167#define TASK_INTERACTIVE(p) \
168 ((p)->prio <= (p)->static_prio - DELTA(p))
169
170#define INTERACTIVE_SLEEP(p) \
171 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
172 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
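/*
 * Worked example for a nice-0 task (MAX_BONUS == 10), assuming HZ=1000 so
 * that MAX_SLEEP_AVG == 1000 jiffies:
 *
 *   DELTA(p)             = SCALE(20, 40, 10) - 20*10/40 + 2 = 5 - 5 + 2 = 2
 *   TASK_INTERACTIVE(p)  : true once the task has earned a dynamic-priority
 *                          bonus of at least 2 (prio <= static_prio - 2)
 *   INTERACTIVE_SLEEP(p) = JIFFIES_TO_NS(1000 * (5 + 2 + 1) / 10 - 1)
 *                        ~= 799 ms of accumulated sleep credit
 */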
173
174#define TASK_PREEMPTS_CURR(p, rq) \
d5f9f942 175 ((p)->prio < (rq)->curr->prio)
1da177e4 176
1da177e4 177#define SCALE_PRIO(x, prio) \
2dd73a4f 178 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
1da177e4 179
2dd73a4f 180static unsigned int static_prio_timeslice(int static_prio)
1da177e4 181{
2dd73a4f
PW
182 if (static_prio < NICE_TO_PRIO(0))
183 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
1da177e4 184 else
2dd73a4f 185 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
1da177e4 186}
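/*
 * Worked example, assuming HZ=1000 (DEF_TIMESLICE == 100 jiffies,
 * MIN_TIMESLICE == 5) and MAX_PRIO == 140, MAX_USER_PRIO == 40:
 *
 *   nice -20 (static_prio 100): SCALE_PRIO(400, 100) = 400 * 40 / 20 = 800ms
 *   nice   0 (static_prio 120): SCALE_PRIO(100, 120) = 100 * 20 / 20 = 100ms
 *   nice  19 (static_prio 139): SCALE_PRIO(100, 139) = 100 *  1 / 20 =   5ms
 */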
2dd73a4f 187
5517d86b
ED
188#ifdef CONFIG_SMP
189/*
190 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
191 * Since cpu_power is a 'constant', we can use a reciprocal divide.
192 */
193static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
194{
195 return reciprocal_divide(load, sg->reciprocal_cpu_power);
196}
197
198/*
199 * Each time a sched group cpu_power is changed,
200 * we must compute its reciprocal value
201 */
202static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203{
204 sg->__cpu_power += val;
205 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
206}
207#endif
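/*
 * For example, find_idlest_group() below computes
 *
 *	avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE);
 *
 * which is equivalent to (avg_load * SCHED_LOAD_SCALE) / sg->__cpu_power,
 * but uses the precomputed reciprocal_cpu_power (a multiply plus shift)
 * instead of an integer division.
 */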
208
91fcdd4e
BP
209/*
210 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
211 * to time slice values: [800ms ... 100ms ... 5ms]
212 *
213 * The higher a thread's priority, the bigger timeslices
214 * it gets during one round of execution. But even the lowest
215 * priority thread gets MIN_TIMESLICE worth of execution time.
216 */
217
36c8b586 218static inline unsigned int task_timeslice(struct task_struct *p)
2dd73a4f
PW
219{
220 return static_prio_timeslice(p->static_prio);
221}
222
e05606d3
IM
223static inline int rt_policy(int policy)
224{
225 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
226 return 1;
227 return 0;
228}
229
230static inline int task_has_rt_policy(struct task_struct *p)
231{
232 return rt_policy(p->policy);
233}
234
1da177e4 235/*
6aa645ea 236 * This is the priority-queue data structure of the RT scheduling class:
1da177e4 237 */
6aa645ea
IM
238struct rt_prio_array {
239 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
240 struct list_head queue[MAX_RT_PRIO];
241};
242
243struct load_stat {
244 struct load_weight load;
245 u64 load_update_start, load_update_last;
246 unsigned long delta_fair, delta_exec, delta_stat;
247};
248
249/* CFS-related fields in a runqueue */
250struct cfs_rq {
251 struct load_weight load;
252 unsigned long nr_running;
253
254 s64 fair_clock;
255 u64 exec_clock;
256 s64 wait_runtime;
257 u64 sleeper_bonus;
258 unsigned long wait_runtime_overruns, wait_runtime_underruns;
259
260 struct rb_root tasks_timeline;
261 struct rb_node *rb_leftmost;
262 struct rb_node *rb_load_balance_curr;
263#ifdef CONFIG_FAIR_GROUP_SCHED
264 /* 'curr' points to currently running entity on this cfs_rq.
265 * It is set to NULL otherwise (i.e when none are currently running).
266 */
267 struct sched_entity *curr;
268 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
269
270 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
271 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
272 * (like users, containers etc.)
273 *
274 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
275 * list is used during load balance.
276 */
277 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
278#endif
279};
1da177e4 280
6aa645ea
IM
281/* Real-Time classes' related field in a runqueue: */
282struct rt_rq {
283 struct rt_prio_array active;
284 int rt_load_balance_idx;
285 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
286};
287
288/*
289 * The prio-array type of the old scheduler:
290 */
1da177e4
LT
291struct prio_array {
292 unsigned int nr_active;
d444886e 293 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
1da177e4
LT
294 struct list_head queue[MAX_PRIO];
295};
296
297/*
298 * This is the main, per-CPU runqueue data structure.
299 *
 300 * Locking rule: code that wants to lock multiple runqueues (such as
 301 * the load balancing or the thread migration code) must acquire the
 302 * locks in ascending &runqueue order.
303 */
70b97a7f 304struct rq {
6aa645ea 305 spinlock_t lock; /* runqueue lock */
1da177e4
LT
306
307 /*
308 * nr_running and cpu_load should be in the same cacheline because
309 * remote CPUs use both these fields when doing load calculation.
310 */
311 unsigned long nr_running;
2dd73a4f 312 unsigned long raw_weighted_load;
6aa645ea
IM
313 #define CPU_LOAD_IDX_MAX 5
314 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
bdecea3a 315 unsigned char idle_at_tick;
46cb4b7c
SS
316#ifdef CONFIG_NO_HZ
317 unsigned char in_nohz_recently;
318#endif
6aa645ea
IM
319 struct load_stat ls; /* capture load from *all* tasks on this cpu */
320 unsigned long nr_load_updates;
321 u64 nr_switches;
322
323 struct cfs_rq cfs;
324#ifdef CONFIG_FAIR_GROUP_SCHED
325 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
1da177e4 326#endif
6aa645ea 327 struct rt_rq rt;
1da177e4
LT
328
329 /*
330 * This is part of a global counter where only the total sum
331 * over all CPUs matters. A task can increase this counter on
332 * one CPU and if it got migrated afterwards it may decrease
333 * it on another CPU. Always updated under the runqueue lock:
334 */
335 unsigned long nr_uninterruptible;
336
337 unsigned long expired_timestamp;
b18ec803 338 unsigned long long most_recent_timestamp;
6aa645ea 339
36c8b586 340 struct task_struct *curr, *idle;
c9819f45 341 unsigned long next_balance;
1da177e4 342 struct mm_struct *prev_mm;
6aa645ea 343
70b97a7f 344 struct prio_array *active, *expired, arrays[2];
1da177e4 345 int best_expired_prio;
6aa645ea
IM
346
347 u64 clock, prev_clock_raw;
348 s64 clock_max_delta;
349
350 unsigned int clock_warps, clock_overflows;
351 unsigned int clock_unstable_events;
352
353 struct sched_class *load_balance_class;
354
1da177e4
LT
355 atomic_t nr_iowait;
356
357#ifdef CONFIG_SMP
358 struct sched_domain *sd;
359
360 /* For active balancing */
361 int active_balance;
362 int push_cpu;
0a2966b4 363 int cpu; /* cpu of this runqueue */
1da177e4 364
36c8b586 365 struct task_struct *migration_thread;
1da177e4
LT
366 struct list_head migration_queue;
367#endif
368
369#ifdef CONFIG_SCHEDSTATS
370 /* latency stats */
371 struct sched_info rq_sched_info;
372
373 /* sys_sched_yield() stats */
374 unsigned long yld_exp_empty;
375 unsigned long yld_act_empty;
376 unsigned long yld_both_empty;
377 unsigned long yld_cnt;
378
379 /* schedule() stats */
380 unsigned long sched_switch;
381 unsigned long sched_cnt;
382 unsigned long sched_goidle;
383
384 /* try_to_wake_up() stats */
385 unsigned long ttwu_cnt;
386 unsigned long ttwu_local;
387#endif
fcb99371 388 struct lock_class_key rq_lock_key;
1da177e4
LT
389};
390
c3396620 391static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
5be9361c 392static DEFINE_MUTEX(sched_hotcpu_mutex);
1da177e4 393
dd41f596
IM
394static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
395{
396 rq->curr->sched_class->check_preempt_curr(rq, p);
397}
398
0a2966b4
CL
399static inline int cpu_of(struct rq *rq)
400{
401#ifdef CONFIG_SMP
402 return rq->cpu;
403#else
404 return 0;
405#endif
406}
407
20d315d4
IM
408/*
409 * Per-runqueue clock, as finegrained as the platform can give us:
410 */
411static unsigned long long __rq_clock(struct rq *rq)
412{
413 u64 prev_raw = rq->prev_clock_raw;
414 u64 now = sched_clock();
415 s64 delta = now - prev_raw;
416 u64 clock = rq->clock;
417
418 /*
419 * Protect against sched_clock() occasionally going backwards:
420 */
421 if (unlikely(delta < 0)) {
422 clock++;
423 rq->clock_warps++;
424 } else {
425 /*
426 * Catch too large forward jumps too:
427 */
428 if (unlikely(delta > 2*TICK_NSEC)) {
429 clock++;
430 rq->clock_overflows++;
431 } else {
432 if (unlikely(delta > rq->clock_max_delta))
433 rq->clock_max_delta = delta;
434 clock += delta;
435 }
436 }
437
438 rq->prev_clock_raw = now;
439 rq->clock = clock;
440
441 return clock;
442}
443
444static inline unsigned long long rq_clock(struct rq *rq)
445{
446 int this_cpu = smp_processor_id();
447
448 if (this_cpu == cpu_of(rq))
449 return __rq_clock(rq);
450
451 return rq->clock;
452}
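/*
 * Numeric sketch, assuming HZ=1000 (TICK_NSEC ~= 1000000 ns): a backwards
 * step of sched_clock() advances rq->clock by just 1ns and bumps
 * clock_warps; a forward jump of more than ~2ms is likewise clamped to
 * 1ns and bumps clock_overflows; anything in between is added to
 * rq->clock unchanged. Remote CPUs calling rq_clock() only see the last
 * value cached in rq->clock.
 */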
453
674311d5
NP
454/*
455 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1a20ff27 456 * See detach_destroy_domains: synchronize_sched for details.
674311d5
NP
457 *
458 * The domain tree of any CPU may only be accessed from within
459 * preempt-disabled sections.
460 */
48f24c4d
IM
461#define for_each_domain(cpu, __sd) \
462 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1da177e4
LT
463
464#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
465#define this_rq() (&__get_cpu_var(runqueues))
466#define task_rq(p) cpu_rq(task_cpu(p))
467#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
468
138a8aeb
IM
469#ifdef CONFIG_FAIR_GROUP_SCHED
470/* Change a task's ->cfs_rq if it moves across CPUs */
471static inline void set_task_cfs_rq(struct task_struct *p)
472{
473 p->se.cfs_rq = &task_rq(p)->cfs;
474}
475#else
476static inline void set_task_cfs_rq(struct task_struct *p)
477{
478}
479#endif
480
1da177e4 481#ifndef prepare_arch_switch
4866cde0
NP
482# define prepare_arch_switch(next) do { } while (0)
483#endif
484#ifndef finish_arch_switch
485# define finish_arch_switch(prev) do { } while (0)
486#endif
487
488#ifndef __ARCH_WANT_UNLOCKED_CTXSW
70b97a7f 489static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
490{
491 return rq->curr == p;
492}
493
70b97a7f 494static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
495{
496}
497
70b97a7f 498static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0 499{
da04c035
IM
500#ifdef CONFIG_DEBUG_SPINLOCK
501 /* this is a valid case when another task releases the spinlock */
502 rq->lock.owner = current;
503#endif
8a25d5de
IM
504 /*
505 * If we are tracking spinlock dependencies then we have to
506 * fix up the runqueue lock - which gets 'carried over' from
507 * prev into current:
508 */
509 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
510
4866cde0
NP
511 spin_unlock_irq(&rq->lock);
512}
513
514#else /* __ARCH_WANT_UNLOCKED_CTXSW */
70b97a7f 515static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
516{
517#ifdef CONFIG_SMP
518 return p->oncpu;
519#else
520 return rq->curr == p;
521#endif
522}
523
70b97a7f 524static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
525{
526#ifdef CONFIG_SMP
527 /*
528 * We can optimise this out completely for !SMP, because the
529 * SMP rebalancing from interrupt is the only thing that cares
530 * here.
531 */
532 next->oncpu = 1;
533#endif
534#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
535 spin_unlock_irq(&rq->lock);
536#else
537 spin_unlock(&rq->lock);
538#endif
539}
540
70b97a7f 541static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0
NP
542{
543#ifdef CONFIG_SMP
544 /*
545 * After ->oncpu is cleared, the task can be moved to a different CPU.
546 * We must ensure this doesn't happen until the switch is completely
547 * finished.
548 */
549 smp_wmb();
550 prev->oncpu = 0;
551#endif
552#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
553 local_irq_enable();
1da177e4 554#endif
4866cde0
NP
555}
556#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1da177e4 557
b29739f9
IM
558/*
559 * __task_rq_lock - lock the runqueue a given task resides on.
 560 * Must be called with interrupts disabled.
561 */
70b97a7f 562static inline struct rq *__task_rq_lock(struct task_struct *p)
b29739f9
IM
563 __acquires(rq->lock)
564{
70b97a7f 565 struct rq *rq;
b29739f9
IM
566
567repeat_lock_task:
568 rq = task_rq(p);
569 spin_lock(&rq->lock);
570 if (unlikely(rq != task_rq(p))) {
571 spin_unlock(&rq->lock);
572 goto repeat_lock_task;
573 }
574 return rq;
575}
576
1da177e4
LT
577/*
578 * task_rq_lock - lock the runqueue a given task resides on and disable
579 * interrupts. Note the ordering: we can safely lookup the task_rq without
580 * explicitly disabling preemption.
581 */
70b97a7f 582static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1da177e4
LT
583 __acquires(rq->lock)
584{
70b97a7f 585 struct rq *rq;
1da177e4
LT
586
587repeat_lock_task:
588 local_irq_save(*flags);
589 rq = task_rq(p);
590 spin_lock(&rq->lock);
591 if (unlikely(rq != task_rq(p))) {
592 spin_unlock_irqrestore(&rq->lock, *flags);
593 goto repeat_lock_task;
594 }
595 return rq;
596}
597
70b97a7f 598static inline void __task_rq_unlock(struct rq *rq)
b29739f9
IM
599 __releases(rq->lock)
600{
601 spin_unlock(&rq->lock);
602}
603
70b97a7f 604static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1da177e4
LT
605 __releases(rq->lock)
606{
607 spin_unlock_irqrestore(&rq->lock, *flags);
608}
609
1da177e4 610/*
cc2a73b5 611 * this_rq_lock - lock this runqueue and disable interrupts.
1da177e4 612 */
70b97a7f 613static inline struct rq *this_rq_lock(void)
1da177e4
LT
614 __acquires(rq->lock)
615{
70b97a7f 616 struct rq *rq;
1da177e4
LT
617
618 local_irq_disable();
619 rq = this_rq();
620 spin_lock(&rq->lock);
621
622 return rq;
623}
624
c24d20db
IM
625/*
626 * resched_task - mark a task 'to be rescheduled now'.
627 *
628 * On UP this means the setting of the need_resched flag, on SMP it
629 * might also involve a cross-CPU call to trigger the scheduler on
630 * the target CPU.
631 */
632#ifdef CONFIG_SMP
633
634#ifndef tsk_is_polling
635#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
636#endif
637
638static void resched_task(struct task_struct *p)
639{
640 int cpu;
641
642 assert_spin_locked(&task_rq(p)->lock);
643
644 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
645 return;
646
647 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
648
649 cpu = task_cpu(p);
650 if (cpu == smp_processor_id())
651 return;
652
653 /* NEED_RESCHED must be visible before we test polling */
654 smp_mb();
655 if (!tsk_is_polling(p))
656 smp_send_reschedule(cpu);
657}
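/*
 * The polling test above is an IPI-avoidance optimisation: a CPU whose
 * idle loop has TIF_POLLING_NRFLAG set is polling its thread flags
 * (e.g. via mwait), so setting TIF_NEED_RESCHED, made visible by the
 * smp_mb(), is enough to wake it; only non-polling CPUs need the
 * smp_send_reschedule() IPI.
 */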
658
659static void resched_cpu(int cpu)
660{
661 struct rq *rq = cpu_rq(cpu);
662 unsigned long flags;
663
664 if (!spin_trylock_irqsave(&rq->lock, flags))
665 return;
666 resched_task(cpu_curr(cpu));
667 spin_unlock_irqrestore(&rq->lock, flags);
668}
669#else
670static inline void resched_task(struct task_struct *p)
671{
672 assert_spin_locked(&task_rq(p)->lock);
673 set_tsk_need_resched(p);
674}
675#endif
676
45bf76df
IM
677static u64 div64_likely32(u64 divident, unsigned long divisor)
678{
679#if BITS_PER_LONG == 32
680 if (likely(divident <= 0xffffffffULL))
681 return (u32)divident / divisor;
682 do_div(divident, divisor);
683
684 return divident;
685#else
686 return divident / divisor;
687#endif
688}
689
690#if BITS_PER_LONG == 32
691# define WMULT_CONST (~0UL)
692#else
693# define WMULT_CONST (1UL << 32)
694#endif
695
696#define WMULT_SHIFT 32
697
698static inline unsigned long
699calc_delta_mine(unsigned long delta_exec, unsigned long weight,
700 struct load_weight *lw)
701{
702 u64 tmp;
703
704 if (unlikely(!lw->inv_weight))
705 lw->inv_weight = WMULT_CONST / lw->weight;
706
707 tmp = (u64)delta_exec * weight;
708 /*
709 * Check whether we'd overflow the 64-bit multiplication:
710 */
711 if (unlikely(tmp > WMULT_CONST)) {
712 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
713 >> (WMULT_SHIFT/2);
714 } else {
715 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
716 }
717
718 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
719}
720
721static inline unsigned long
722calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
723{
724 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
725}
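/*
 * Rough numeric sketch of the scaling above (assuming NICE_0_LOAD == 1024
 * and ignoring the runtime-limit clamp): with two nice-0 tasks runnable,
 * lw->weight == 2048, inv_weight ~= 2^32/2048, and
 *
 *   calc_delta_fair(1000000, lw) ~= 1000000 * 1024 / 2048 = 500000
 *
 * i.e. each task's fair clock advances at half wall-clock speed.
 */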
726
727static void update_load_add(struct load_weight *lw, unsigned long inc)
728{
729 lw->weight += inc;
730 lw->inv_weight = 0;
731}
732
733static void update_load_sub(struct load_weight *lw, unsigned long dec)
734{
735 lw->weight -= dec;
736 lw->inv_weight = 0;
737}
738
739static void __update_curr_load(struct rq *rq, struct load_stat *ls)
740{
741 if (rq->curr != rq->idle && ls->load.weight) {
742 ls->delta_exec += ls->delta_stat;
743 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
744 ls->delta_stat = 0;
745 }
746}
747
748/*
749 * Update delta_exec, delta_fair fields for rq.
750 *
751 * delta_fair clock advances at a rate inversely proportional to
752 * total load (rq->ls.load.weight) on the runqueue, while
753 * delta_exec advances at the same rate as wall-clock (provided
754 * cpu is not idle).
755 *
 756 * delta_exec / delta_fair is a measure of the (smoothed) load on this
 757 * runqueue over any given interval. This (smoothed) load is used
758 * during load balance.
759 *
760 * This function is called /before/ updating rq->ls.load
761 * and when switching tasks.
762 */
763static void update_curr_load(struct rq *rq, u64 now)
764{
765 struct load_stat *ls = &rq->ls;
766 u64 start;
767
768 start = ls->load_update_start;
769 ls->load_update_start = now;
770 ls->delta_stat += now - start;
771 /*
772 * Stagger updates to ls->delta_fair. Very frequent updates
773 * can be expensive.
774 */
775 if (ls->delta_stat >= sysctl_sched_stat_granularity)
776 __update_curr_load(rq, ls);
777}
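/*
 * Example of the delta_exec/delta_fair ratio described above: if two
 * nice-0 tasks are runnable for the whole interval, ls->load.weight is
 * 2*NICE_0_LOAD, so delta_fair grows at half the rate of delta_exec and
 * delta_exec/delta_fair ~= 2, i.e. a load of two in NICE_0_LOAD units.
 */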
778
2dd73a4f
PW
779/*
780 * To aid in avoiding the subversion of "niceness" due to uneven distribution
781 * of tasks with abnormal "nice" values across CPUs the contribution that
782 * each task makes to its run queue's load is weighted according to its
783 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
784 * scaled version of the new time slice allocation that they receive on time
785 * slice expiry etc.
786 */
787
788/*
789 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
790 * If static_prio_timeslice() is ever changed to break this assumption then
791 * this code will need modification
792 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
dd41f596 794#define load_weight(lp) \
2dd73a4f
PW
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \
dd41f596 797 load_weight(static_prio_timeslice(prio))
2dd73a4f 798#define RTPRIO_TO_LOAD_WEIGHT(rp) \
dd41f596
IM
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800
801#define WEIGHT_IDLEPRIO 2
802#define WMULT_IDLEPRIO (1 << 31)
803
804/*
805 * Nice levels are multiplicative, with a gentle 10% change for every
806 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * that remained on nice 0.
809 *
810 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 * it's +10% CPU usage.
813 */
814static const int prio_to_weight[40] = {
815/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817/* 0 */ NICE_0_LOAD /* 1024 */,
818/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
819/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
820};
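/*
 * Worked example of the "10% effect" above: a nice-0 task (weight 1024)
 * running against a nice-1 task (weight 819) gets 1024/(1024+819) ~= 55.5%
 * of the CPU versus ~44.5%, roughly a 10 percentage-point advantage; the
 * same ratio holds between any two adjacent nice levels.
 */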
821
822static const u32 prio_to_wmult[40] = {
823 48356, 60446, 75558, 94446, 118058, 147573,
824 184467, 230589, 288233, 360285, 450347,
825 562979, 703746, 879575, 1099582, 1374389,
 826 1717986, 2147483, 2684354, 3355443, 4194304,
 827 5244160, 6557201, 8196502, 10250518, 12782640,
828 16025997, 19976592, 24970740, 31350126, 39045157,
829 49367440, 61356675, 76695844, 95443717, 119304647,
830 148102320, 186737708, 238609294, 286331153,
831};
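/*
 * These are (to rounding) the 2^32 / weight inverses of prio_to_weight[],
 * e.g. the nice-0 entry is 2^32 / 1024 == 4194304; set_load_weight() uses
 * them to seed se.load.inv_weight so calc_delta_mine() can avoid a
 * division in the common case.
 */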
2dd73a4f 832
36c8b586 833static inline void
dd41f596 834inc_load(struct rq *rq, const struct task_struct *p, u64 now)
2dd73a4f 835{
dd41f596
IM
836 update_curr_load(rq, now);
837 update_load_add(&rq->ls.load, p->se.load.weight);
2dd73a4f
PW
838}
839
36c8b586 840static inline void
dd41f596 841dec_load(struct rq *rq, const struct task_struct *p, u64 now)
2dd73a4f 842{
dd41f596
IM
843 update_curr_load(rq, now);
844 update_load_sub(&rq->ls.load, p->se.load.weight);
2dd73a4f
PW
845}
846
dd41f596 847static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
2dd73a4f
PW
848{
849 rq->nr_running++;
dd41f596 850 inc_load(rq, p, now);
2dd73a4f
PW
851}
852
dd41f596 853static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
2dd73a4f
PW
854{
855 rq->nr_running--;
dd41f596 856 dec_load(rq, p, now);
2dd73a4f
PW
857}
858
dd41f596
IM
859static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
860
861/*
862 * runqueue iterator, to support SMP load-balancing between different
863 * scheduling classes, without having to expose their internal data
864 * structures to the load-balancing proper:
865 */
866struct rq_iterator {
867 void *arg;
868 struct task_struct *(*start)(void *);
869 struct task_struct *(*next)(void *);
870};
871
872static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
873 unsigned long max_nr_move, unsigned long max_load_move,
874 struct sched_domain *sd, enum cpu_idle_type idle,
875 int *all_pinned, unsigned long *load_moved,
876 int this_best_prio, int best_prio, int best_prio_seen,
877 struct rq_iterator *iterator);
878
879#include "sched_stats.h"
880#include "sched_rt.c"
881#include "sched_fair.c"
882#include "sched_idletask.c"
883#ifdef CONFIG_SCHED_DEBUG
884# include "sched_debug.c"
885#endif
886
887#define sched_class_highest (&rt_sched_class)
888
45bf76df
IM
889static void set_load_weight(struct task_struct *p)
890{
dd41f596
IM
891 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
892 p->se.wait_runtime = 0;
893
45bf76df 894 if (task_has_rt_policy(p)) {
dd41f596
IM
895 p->se.load.weight = prio_to_weight[0] * 2;
896 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
897 return;
898 }
45bf76df 899
dd41f596
IM
900 /*
901 * SCHED_IDLE tasks get minimal weight:
902 */
903 if (p->policy == SCHED_IDLE) {
904 p->se.load.weight = WEIGHT_IDLEPRIO;
905 p->se.load.inv_weight = WMULT_IDLEPRIO;
906 return;
907 }
71f8bd46 908
dd41f596
IM
909 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
910 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
71f8bd46
IM
911}
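/*
 * Resulting weights, for reference:
 *
 *   SCHED_NORMAL, nice 0   -> weight NICE_0_LOAD (1024)
 *   SCHED_IDLE             -> weight WEIGHT_IDLEPRIO (2)
 *   SCHED_FIFO / SCHED_RR  -> weight 2 * prio_to_weight[0] (== 177636),
 *                             far heavier than any nice level in load
 *                             calculations.
 */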
912
dd41f596
IM
913static void
914enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
71f8bd46 915{
dd41f596
IM
916 sched_info_queued(p);
917 p->sched_class->enqueue_task(rq, p, wakeup, now);
918 p->se.on_rq = 1;
71f8bd46
IM
919}
920
dd41f596
IM
921static void
922dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
71f8bd46 923{
dd41f596
IM
924 p->sched_class->dequeue_task(rq, p, sleep, now);
925 p->se.on_rq = 0;
71f8bd46
IM
926}
927
14531189 928/*
dd41f596 929 * __normal_prio - return the priority that is based on the static prio
14531189 930 */
14531189
IM
931static inline int __normal_prio(struct task_struct *p)
932{
dd41f596 933 return p->static_prio;
14531189
IM
934}
935
b29739f9
IM
936/*
937 * Calculate the expected normal priority: i.e. priority
938 * without taking RT-inheritance into account. Might be
939 * boosted by interactivity modifiers. Changes upon fork,
940 * setprio syscalls, and whenever the interactivity
941 * estimator recalculates.
942 */
36c8b586 943static inline int normal_prio(struct task_struct *p)
b29739f9
IM
944{
945 int prio;
946
e05606d3 947 if (task_has_rt_policy(p))
b29739f9
IM
948 prio = MAX_RT_PRIO-1 - p->rt_priority;
949 else
950 prio = __normal_prio(p);
951 return prio;
952}
953
954/*
955 * Calculate the current priority, i.e. the priority
956 * taken into account by the scheduler. This value might
957 * be boosted by RT tasks, or might be boosted by
958 * interactivity modifiers. Will be RT if the task got
959 * RT-boosted. If not then it returns p->normal_prio.
960 */
36c8b586 961static int effective_prio(struct task_struct *p)
b29739f9
IM
962{
963 p->normal_prio = normal_prio(p);
964 /*
965 * If we are RT tasks or we were boosted to RT priority,
966 * keep the priority unchanged. Otherwise, update priority
967 * to the normal priority:
968 */
969 if (!rt_prio(p->prio))
970 return p->normal_prio;
971 return p->prio;
972}
973
1da177e4 974/*
dd41f596 975 * activate_task - move a task to the runqueue.
1da177e4 976 */
dd41f596 977static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1da177e4 978{
dd41f596 979 u64 now = rq_clock(rq);
d425b274 980
dd41f596
IM
981 if (p->state == TASK_UNINTERRUPTIBLE)
982 rq->nr_uninterruptible--;
1da177e4 983
dd41f596
IM
984 enqueue_task(rq, p, wakeup, now);
985 inc_nr_running(p, rq, now);
1da177e4
LT
986}
987
988/*
dd41f596 989 * activate_idle_task - move idle task to the _front_ of runqueue.
1da177e4 990 */
dd41f596 991static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1da177e4 992{
dd41f596 993 u64 now = rq_clock(rq);
1da177e4 994
dd41f596
IM
995 if (p->state == TASK_UNINTERRUPTIBLE)
996 rq->nr_uninterruptible--;
ece8a684 997
dd41f596
IM
998 enqueue_task(rq, p, 0, now);
999 inc_nr_running(p, rq, now);
1da177e4
LT
1000}
1001
1002/*
1003 * deactivate_task - remove a task from the runqueue.
1004 */
dd41f596 1005static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1da177e4 1006{
dd41f596
IM
1007 u64 now = rq_clock(rq);
1008
1009 if (p->state == TASK_UNINTERRUPTIBLE)
1010 rq->nr_uninterruptible++;
1011
1012 dequeue_task(rq, p, sleep, now);
1013 dec_nr_running(p, rq, now);
1da177e4
LT
1014}
1015
1da177e4
LT
1016/**
1017 * task_curr - is this task currently executing on a CPU?
1018 * @p: the task in question.
1019 */
36c8b586 1020inline int task_curr(const struct task_struct *p)
1da177e4
LT
1021{
1022 return cpu_curr(task_cpu(p)) == p;
1023}
1024
2dd73a4f
PW
1025/* Used instead of source_load when we know the type == 0 */
1026unsigned long weighted_cpuload(const int cpu)
1027{
dd41f596
IM
1028 return cpu_rq(cpu)->ls.load.weight;
1029}
1030
1031static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1032{
1033#ifdef CONFIG_SMP
1034 task_thread_info(p)->cpu = cpu;
1035 set_task_cfs_rq(p);
1036#endif
2dd73a4f
PW
1037}
1038
1da177e4 1039#ifdef CONFIG_SMP
c65cc870 1040
dd41f596 1041void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
c65cc870 1042{
dd41f596
IM
1043 int old_cpu = task_cpu(p);
1044 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1045 u64 clock_offset, fair_clock_offset;
1046
1047 clock_offset = old_rq->clock - new_rq->clock;
1048 fair_clock_offset = old_rq->cfs.fair_clock -
1049 new_rq->cfs.fair_clock;
1050 if (p->se.wait_start)
1051 p->se.wait_start -= clock_offset;
1052 if (p->se.wait_start_fair)
1053 p->se.wait_start_fair -= fair_clock_offset;
1054 if (p->se.sleep_start)
1055 p->se.sleep_start -= clock_offset;
1056 if (p->se.block_start)
1057 p->se.block_start -= clock_offset;
1058 if (p->se.sleep_start_fair)
1059 p->se.sleep_start_fair -= fair_clock_offset;
1060
1061 __set_task_cpu(p, new_cpu);
c65cc870
IM
1062}
1063
70b97a7f 1064struct migration_req {
1da177e4 1065 struct list_head list;
1da177e4 1066
36c8b586 1067 struct task_struct *task;
1da177e4
LT
1068 int dest_cpu;
1069
1da177e4 1070 struct completion done;
70b97a7f 1071};
1da177e4
LT
1072
1073/*
1074 * The task's runqueue lock must be held.
 1075 * Returns true if you have to wait for the migration thread.
1076 */
36c8b586 1077static int
70b97a7f 1078migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1da177e4 1079{
70b97a7f 1080 struct rq *rq = task_rq(p);
1da177e4
LT
1081
1082 /*
1083 * If the task is not on a runqueue (and not running), then
1084 * it is sufficient to simply update the task's cpu field.
1085 */
dd41f596 1086 if (!p->se.on_rq && !task_running(rq, p)) {
1da177e4
LT
1087 set_task_cpu(p, dest_cpu);
1088 return 0;
1089 }
1090
1091 init_completion(&req->done);
1da177e4
LT
1092 req->task = p;
1093 req->dest_cpu = dest_cpu;
1094 list_add(&req->list, &rq->migration_queue);
48f24c4d 1095
1da177e4
LT
1096 return 1;
1097}
1098
1099/*
1100 * wait_task_inactive - wait for a thread to unschedule.
1101 *
1102 * The caller must ensure that the task *will* unschedule sometime soon,
1103 * else this function might spin for a *long* time. This function can't
1104 * be called with interrupts off, or it may introduce deadlock with
1105 * smp_call_function() if an IPI is sent by the same process we are
1106 * waiting to become inactive.
1107 */
36c8b586 1108void wait_task_inactive(struct task_struct *p)
1da177e4
LT
1109{
1110 unsigned long flags;
dd41f596 1111 int running, on_rq;
70b97a7f 1112 struct rq *rq;
1da177e4
LT
1113
1114repeat:
fa490cfd
LT
1115 /*
1116 * We do the initial early heuristics without holding
1117 * any task-queue locks at all. We'll only try to get
1118 * the runqueue lock when things look like they will
1119 * work out!
1120 */
1121 rq = task_rq(p);
1122
1123 /*
1124 * If the task is actively running on another CPU
1125 * still, just relax and busy-wait without holding
1126 * any locks.
1127 *
1128 * NOTE! Since we don't hold any locks, it's not
1129 * even sure that "rq" stays as the right runqueue!
1130 * But we don't care, since "task_running()" will
1131 * return false if the runqueue has changed and p
1132 * is actually now running somewhere else!
1133 */
1134 while (task_running(rq, p))
1135 cpu_relax();
1136
1137 /*
1138 * Ok, time to look more closely! We need the rq
1139 * lock now, to be *sure*. If we're wrong, we'll
1140 * just go back and repeat.
1141 */
1da177e4 1142 rq = task_rq_lock(p, &flags);
fa490cfd 1143 running = task_running(rq, p);
dd41f596 1144 on_rq = p->se.on_rq;
fa490cfd
LT
1145 task_rq_unlock(rq, &flags);
1146
1147 /*
1148 * Was it really running after all now that we
1149 * checked with the proper locks actually held?
1150 *
1151 * Oops. Go back and try again..
1152 */
1153 if (unlikely(running)) {
1da177e4 1154 cpu_relax();
1da177e4
LT
1155 goto repeat;
1156 }
fa490cfd
LT
1157
1158 /*
1159 * It's not enough that it's not actively running,
1160 * it must be off the runqueue _entirely_, and not
1161 * preempted!
1162 *
 1163 * So if it was still runnable (but just not actively
1164 * running right now), it's preempted, and we should
1165 * yield - it could be a while.
1166 */
dd41f596 1167 if (unlikely(on_rq)) {
fa490cfd
LT
1168 yield();
1169 goto repeat;
1170 }
1171
1172 /*
1173 * Ahh, all good. It wasn't running, and it wasn't
1174 * runnable, which means that it will never become
1175 * running in the future either. We're all done!
1176 */
1da177e4
LT
1177}
1178
1179/***
1180 * kick_process - kick a running thread to enter/exit the kernel
1181 * @p: the to-be-kicked thread
1182 *
1183 * Cause a process which is running on another CPU to enter
1184 * kernel-mode, without any delay. (to get signals handled.)
1185 *
 1186 * NOTE: this function doesn't have to take the runqueue lock,
1187 * because all it wants to ensure is that the remote task enters
1188 * the kernel. If the IPI races and the task has been migrated
1189 * to another CPU then no harm is done and the purpose has been
1190 * achieved as well.
1191 */
36c8b586 1192void kick_process(struct task_struct *p)
1da177e4
LT
1193{
1194 int cpu;
1195
1196 preempt_disable();
1197 cpu = task_cpu(p);
1198 if ((cpu != smp_processor_id()) && task_curr(p))
1199 smp_send_reschedule(cpu);
1200 preempt_enable();
1201}
1202
1203/*
2dd73a4f
PW
1204 * Return a low guess at the load of a migration-source cpu weighted
1205 * according to the scheduling class and "nice" value.
1da177e4
LT
1206 *
1207 * We want to under-estimate the load of migration sources, to
1208 * balance conservatively.
1209 */
a2000572 1210static inline unsigned long source_load(int cpu, int type)
1da177e4 1211{
70b97a7f 1212 struct rq *rq = cpu_rq(cpu);
dd41f596 1213 unsigned long total = weighted_cpuload(cpu);
2dd73a4f 1214
3b0bd9bc 1215 if (type == 0)
dd41f596 1216 return total;
b910472d 1217
dd41f596 1218 return min(rq->cpu_load[type-1], total);
1da177e4
LT
1219}
1220
1221/*
2dd73a4f
PW
1222 * Return a high guess at the load of a migration-target cpu weighted
1223 * according to the scheduling class and "nice" value.
1da177e4 1224 */
a2000572 1225static inline unsigned long target_load(int cpu, int type)
1da177e4 1226{
70b97a7f 1227 struct rq *rq = cpu_rq(cpu);
dd41f596 1228 unsigned long total = weighted_cpuload(cpu);
2dd73a4f 1229
7897986b 1230 if (type == 0)
dd41f596 1231 return total;
3b0bd9bc 1232
dd41f596 1233 return max(rq->cpu_load[type-1], total);
2dd73a4f
PW
1234}
1235
1236/*
1237 * Return the average load per task on the cpu's run queue
1238 */
1239static inline unsigned long cpu_avg_load_per_task(int cpu)
1240{
70b97a7f 1241 struct rq *rq = cpu_rq(cpu);
dd41f596 1242 unsigned long total = weighted_cpuload(cpu);
2dd73a4f
PW
1243 unsigned long n = rq->nr_running;
1244
dd41f596 1245 return n ? total / n : SCHED_LOAD_SCALE;
1da177e4
LT
1246}
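/*
 * Quick example of the three helpers above: with two nice-0 tasks on a
 * CPU, weighted_cpuload() is 2*NICE_0_LOAD == 2048; source_load(cpu, 1)
 * returns min(cpu_load[0], 2048) to err low for migration sources, while
 * target_load(cpu, 1) returns max(cpu_load[0], 2048) to err high for
 * targets, and cpu_avg_load_per_task() here would be 1024.
 */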
1247
147cbb4b
NP
1248/*
1249 * find_idlest_group finds and returns the least busy CPU group within the
1250 * domain.
1251 */
1252static struct sched_group *
1253find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1254{
1255 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1256 unsigned long min_load = ULONG_MAX, this_load = 0;
1257 int load_idx = sd->forkexec_idx;
1258 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1259
1260 do {
1261 unsigned long load, avg_load;
1262 int local_group;
1263 int i;
1264
da5a5522
BD
1265 /* Skip over this group if it has no CPUs allowed */
1266 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1267 goto nextgroup;
1268
147cbb4b 1269 local_group = cpu_isset(this_cpu, group->cpumask);
147cbb4b
NP
1270
1271 /* Tally up the load of all CPUs in the group */
1272 avg_load = 0;
1273
1274 for_each_cpu_mask(i, group->cpumask) {
1275 /* Bias balancing toward cpus of our domain */
1276 if (local_group)
1277 load = source_load(i, load_idx);
1278 else
1279 load = target_load(i, load_idx);
1280
1281 avg_load += load;
1282 }
1283
1284 /* Adjust by relative CPU power of the group */
5517d86b
ED
1285 avg_load = sg_div_cpu_power(group,
1286 avg_load * SCHED_LOAD_SCALE);
147cbb4b
NP
1287
1288 if (local_group) {
1289 this_load = avg_load;
1290 this = group;
1291 } else if (avg_load < min_load) {
1292 min_load = avg_load;
1293 idlest = group;
1294 }
da5a5522 1295nextgroup:
147cbb4b
NP
1296 group = group->next;
1297 } while (group != sd->groups);
1298
1299 if (!idlest || 100*this_load < imbalance*min_load)
1300 return NULL;
1301 return idlest;
1302}
1303
1304/*
0feaece9 1305 * find_idlest_cpu - find the idlest cpu among the cpus in group.
147cbb4b 1306 */
95cdf3b7
IM
1307static int
1308find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
147cbb4b 1309{
da5a5522 1310 cpumask_t tmp;
147cbb4b
NP
1311 unsigned long load, min_load = ULONG_MAX;
1312 int idlest = -1;
1313 int i;
1314
da5a5522
BD
1315 /* Traverse only the allowed CPUs */
1316 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1317
1318 for_each_cpu_mask(i, tmp) {
2dd73a4f 1319 load = weighted_cpuload(i);
147cbb4b
NP
1320
1321 if (load < min_load || (load == min_load && i == this_cpu)) {
1322 min_load = load;
1323 idlest = i;
1324 }
1325 }
1326
1327 return idlest;
1328}
1329
476d139c
NP
1330/*
1331 * sched_balance_self: balance the current task (running on cpu) in domains
1332 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1333 * SD_BALANCE_EXEC.
1334 *
1335 * Balance, ie. select the least loaded group.
1336 *
1337 * Returns the target CPU number, or the same CPU if no balancing is needed.
1338 *
1339 * preempt must be disabled.
1340 */
1341static int sched_balance_self(int cpu, int flag)
1342{
1343 struct task_struct *t = current;
1344 struct sched_domain *tmp, *sd = NULL;
147cbb4b 1345
c96d145e 1346 for_each_domain(cpu, tmp) {
5c45bf27
SS
1347 /*
1348 * If power savings logic is enabled for a domain, stop there.
1349 */
1350 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1351 break;
476d139c
NP
1352 if (tmp->flags & flag)
1353 sd = tmp;
c96d145e 1354 }
476d139c
NP
1355
1356 while (sd) {
1357 cpumask_t span;
1358 struct sched_group *group;
1a848870
SS
1359 int new_cpu, weight;
1360
1361 if (!(sd->flags & flag)) {
1362 sd = sd->child;
1363 continue;
1364 }
476d139c
NP
1365
1366 span = sd->span;
1367 group = find_idlest_group(sd, t, cpu);
1a848870
SS
1368 if (!group) {
1369 sd = sd->child;
1370 continue;
1371 }
476d139c 1372
da5a5522 1373 new_cpu = find_idlest_cpu(group, t, cpu);
1a848870
SS
1374 if (new_cpu == -1 || new_cpu == cpu) {
1375 /* Now try balancing at a lower domain level of cpu */
1376 sd = sd->child;
1377 continue;
1378 }
476d139c 1379
1a848870 1380 /* Now try balancing at a lower domain level of new_cpu */
476d139c 1381 cpu = new_cpu;
476d139c
NP
1382 sd = NULL;
1383 weight = cpus_weight(span);
1384 for_each_domain(cpu, tmp) {
1385 if (weight <= cpus_weight(tmp->span))
1386 break;
1387 if (tmp->flags & flag)
1388 sd = tmp;
1389 }
1390 /* while loop will break here if sd == NULL */
1391 }
1392
1393 return cpu;
1394}
1395
1396#endif /* CONFIG_SMP */
1da177e4
LT
1397
1398/*
1399 * wake_idle() will wake a task on an idle cpu if task->cpu is
1400 * not idle and an idle cpu is available. The span of cpus to
1401 * search starts with cpus closest then further out as needed,
1402 * so we always favor a closer, idle cpu.
1403 *
1404 * Returns the CPU we should wake onto.
1405 */
1406#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
36c8b586 1407static int wake_idle(int cpu, struct task_struct *p)
1da177e4
LT
1408{
1409 cpumask_t tmp;
1410 struct sched_domain *sd;
1411 int i;
1412
4953198b
SS
1413 /*
1414 * If it is idle, then it is the best cpu to run this task.
1415 *
1416 * This cpu is also the best, if it has more than one task already.
 1417 * Siblings must also be busy (in most cases), since they have not
 1418 * picked up the extra load from this cpu, and hence we need not check
 1419 * the sibling runqueue info. This avoids the checks and the cache-miss
 1420 * penalties associated with them.
1421 */
1422 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1da177e4
LT
1423 return cpu;
1424
1425 for_each_domain(cpu, sd) {
1426 if (sd->flags & SD_WAKE_IDLE) {
e0f364f4 1427 cpus_and(tmp, sd->span, p->cpus_allowed);
1da177e4
LT
1428 for_each_cpu_mask(i, tmp) {
1429 if (idle_cpu(i))
1430 return i;
1431 }
1432 }
e0f364f4
NP
1433 else
1434 break;
1da177e4
LT
1435 }
1436 return cpu;
1437}
1438#else
36c8b586 1439static inline int wake_idle(int cpu, struct task_struct *p)
1da177e4
LT
1440{
1441 return cpu;
1442}
1443#endif
1444
1445/***
1446 * try_to_wake_up - wake up a thread
1447 * @p: the to-be-woken-up thread
1448 * @state: the mask of task states that can be woken
1449 * @sync: do a synchronous wakeup?
1450 *
1451 * Put it on the run-queue if it's not already there. The "current"
1452 * thread is always on the run-queue (except when the actual
1453 * re-schedule is in progress), and as such you're allowed to do
1454 * the simpler "current->state = TASK_RUNNING" to mark yourself
1455 * runnable without the overhead of this.
1456 *
1457 * returns failure only if the task is already active.
1458 */
36c8b586 1459static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1da177e4
LT
1460{
1461 int cpu, this_cpu, success = 0;
1462 unsigned long flags;
1463 long old_state;
70b97a7f 1464 struct rq *rq;
1da177e4 1465#ifdef CONFIG_SMP
7897986b 1466 struct sched_domain *sd, *this_sd = NULL;
70b97a7f 1467 unsigned long load, this_load;
1da177e4
LT
1468 int new_cpu;
1469#endif
1470
1471 rq = task_rq_lock(p, &flags);
1472 old_state = p->state;
1473 if (!(old_state & state))
1474 goto out;
1475
dd41f596 1476 if (p->se.on_rq)
1da177e4
LT
1477 goto out_running;
1478
1479 cpu = task_cpu(p);
1480 this_cpu = smp_processor_id();
1481
1482#ifdef CONFIG_SMP
1483 if (unlikely(task_running(rq, p)))
1484 goto out_activate;
1485
7897986b
NP
1486 new_cpu = cpu;
1487
1da177e4
LT
1488 schedstat_inc(rq, ttwu_cnt);
1489 if (cpu == this_cpu) {
1490 schedstat_inc(rq, ttwu_local);
7897986b
NP
1491 goto out_set_cpu;
1492 }
1493
1494 for_each_domain(this_cpu, sd) {
1495 if (cpu_isset(cpu, sd->span)) {
1496 schedstat_inc(sd, ttwu_wake_remote);
1497 this_sd = sd;
1498 break;
1da177e4
LT
1499 }
1500 }
1da177e4 1501
7897986b 1502 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1da177e4
LT
1503 goto out_set_cpu;
1504
1da177e4 1505 /*
7897986b 1506 * Check for affine wakeup and passive balancing possibilities.
1da177e4 1507 */
7897986b
NP
1508 if (this_sd) {
1509 int idx = this_sd->wake_idx;
1510 unsigned int imbalance;
1da177e4 1511
a3f21bce
NP
1512 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1513
7897986b
NP
1514 load = source_load(cpu, idx);
1515 this_load = target_load(this_cpu, idx);
1da177e4 1516
7897986b
NP
1517 new_cpu = this_cpu; /* Wake to this CPU if we can */
1518
a3f21bce
NP
1519 if (this_sd->flags & SD_WAKE_AFFINE) {
1520 unsigned long tl = this_load;
33859f7f
MOS
1521 unsigned long tl_per_task;
1522
1523 tl_per_task = cpu_avg_load_per_task(this_cpu);
2dd73a4f 1524
1da177e4 1525 /*
a3f21bce
NP
1526 * If sync wakeup then subtract the (maximum possible)
1527 * effect of the currently running task from the load
1528 * of the current CPU:
1da177e4 1529 */
a3f21bce 1530 if (sync)
dd41f596 1531 tl -= current->se.load.weight;
a3f21bce
NP
1532
1533 if ((tl <= load &&
2dd73a4f 1534 tl + target_load(cpu, idx) <= tl_per_task) ||
dd41f596 1535 100*(tl + p->se.load.weight) <= imbalance*load) {
a3f21bce
NP
1536 /*
1537 * This domain has SD_WAKE_AFFINE and
1538 * p is cache cold in this domain, and
1539 * there is no bad imbalance.
1540 */
1541 schedstat_inc(this_sd, ttwu_move_affine);
1542 goto out_set_cpu;
1543 }
1544 }
1545
1546 /*
1547 * Start passive balancing when half the imbalance_pct
1548 * limit is reached.
1549 */
1550 if (this_sd->flags & SD_WAKE_BALANCE) {
1551 if (imbalance*this_load <= 100*load) {
1552 schedstat_inc(this_sd, ttwu_move_balance);
1553 goto out_set_cpu;
1554 }
1da177e4
LT
1555 }
1556 }
1557
1558 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1559out_set_cpu:
1560 new_cpu = wake_idle(new_cpu, p);
1561 if (new_cpu != cpu) {
1562 set_task_cpu(p, new_cpu);
1563 task_rq_unlock(rq, &flags);
1564 /* might preempt at this point */
1565 rq = task_rq_lock(p, &flags);
1566 old_state = p->state;
1567 if (!(old_state & state))
1568 goto out;
dd41f596 1569 if (p->se.on_rq)
1da177e4
LT
1570 goto out_running;
1571
1572 this_cpu = smp_processor_id();
1573 cpu = task_cpu(p);
1574 }
1575
1576out_activate:
1577#endif /* CONFIG_SMP */
dd41f596 1578 activate_task(rq, p, 1);
1da177e4
LT
1579 /*
1580 * Sync wakeups (i.e. those types of wakeups where the waker
1581 * has indicated that it will leave the CPU in short order)
1582 * don't trigger a preemption, if the woken up task will run on
1583 * this cpu. (in this case the 'I will reschedule' promise of
1584 * the waker guarantees that the freshly woken up task is going
1585 * to be considered on this CPU.)
1586 */
dd41f596
IM
1587 if (!sync || cpu != this_cpu)
1588 check_preempt_curr(rq, p);
1da177e4
LT
1589 success = 1;
1590
1591out_running:
1592 p->state = TASK_RUNNING;
1593out:
1594 task_rq_unlock(rq, &flags);
1595
1596 return success;
1597}
1598
36c8b586 1599int fastcall wake_up_process(struct task_struct *p)
1da177e4
LT
1600{
1601 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1602 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1603}
1da177e4
LT
1604EXPORT_SYMBOL(wake_up_process);
1605
36c8b586 1606int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1da177e4
LT
1607{
1608 return try_to_wake_up(p, state, 0);
1609}
1610
1da177e4
LT
1611/*
1612 * Perform scheduler related setup for a newly forked process p.
1613 * p is forked by current.
dd41f596
IM
1614 *
1615 * __sched_fork() is basic setup used by init_idle() too:
1616 */
1617static void __sched_fork(struct task_struct *p)
1618{
1619 p->se.wait_start_fair = 0;
1620 p->se.wait_start = 0;
1621 p->se.exec_start = 0;
1622 p->se.sum_exec_runtime = 0;
1623 p->se.delta_exec = 0;
1624 p->se.delta_fair_run = 0;
1625 p->se.delta_fair_sleep = 0;
1626 p->se.wait_runtime = 0;
1627 p->se.sum_wait_runtime = 0;
1628 p->se.sum_sleep_runtime = 0;
1629 p->se.sleep_start = 0;
1630 p->se.sleep_start_fair = 0;
1631 p->se.block_start = 0;
1632 p->se.sleep_max = 0;
1633 p->se.block_max = 0;
1634 p->se.exec_max = 0;
1635 p->se.wait_max = 0;
1636 p->se.wait_runtime_overruns = 0;
1637 p->se.wait_runtime_underruns = 0;
476d139c 1638
dd41f596
IM
1639 INIT_LIST_HEAD(&p->run_list);
1640 p->se.on_rq = 0;
476d139c 1641
1da177e4
LT
1642 /*
1643 * We mark the process as running here, but have not actually
1644 * inserted it onto the runqueue yet. This guarantees that
1645 * nobody will actually run it, and a signal or other external
1646 * event cannot wake it up and insert it on the runqueue either.
1647 */
1648 p->state = TASK_RUNNING;
dd41f596
IM
1649}
1650
1651/*
1652 * fork()/clone()-time setup:
1653 */
1654void sched_fork(struct task_struct *p, int clone_flags)
1655{
1656 int cpu = get_cpu();
1657
1658 __sched_fork(p);
1659
1660#ifdef CONFIG_SMP
1661 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1662#endif
1663 __set_task_cpu(p, cpu);
b29739f9
IM
1664
1665 /*
1666 * Make sure we do not leak PI boosting priority to the child:
1667 */
1668 p->prio = current->normal_prio;
1669
52f17b6c 1670#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
dd41f596 1671 if (likely(sched_info_on()))
52f17b6c 1672 memset(&p->sched_info, 0, sizeof(p->sched_info));
1da177e4 1673#endif
d6077cb8 1674#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4866cde0
NP
1675 p->oncpu = 0;
1676#endif
1da177e4 1677#ifdef CONFIG_PREEMPT
4866cde0 1678 /* Want to start with kernel preemption disabled. */
a1261f54 1679 task_thread_info(p)->preempt_count = 1;
1da177e4 1680#endif
476d139c 1681 put_cpu();
1da177e4
LT
1682}
1683
dd41f596
IM
1684/*
1685 * After fork, child runs first. (default) If set to 0 then
1686 * parent will (try to) run first.
1687 */
1688unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1689
1da177e4
LT
1690/*
1691 * wake_up_new_task - wake up a newly created task for the first time.
1692 *
1693 * This function will do some initial scheduler statistics housekeeping
1694 * that must be done for every newly created context, then puts the task
1695 * on the runqueue and wakes it.
1696 */
36c8b586 1697void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1da177e4
LT
1698{
1699 unsigned long flags;
dd41f596
IM
1700 struct rq *rq;
1701 int this_cpu;
1da177e4
LT
1702
1703 rq = task_rq_lock(p, &flags);
147cbb4b 1704 BUG_ON(p->state != TASK_RUNNING);
dd41f596 1705 this_cpu = smp_processor_id(); /* parent's CPU */
1da177e4
LT
1706
1707 p->prio = effective_prio(p);
1708
dd41f596
IM
1709 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1710 task_cpu(p) != this_cpu || !current->se.on_rq) {
1711 activate_task(rq, p, 0);
1da177e4 1712 } else {
1da177e4 1713 /*
dd41f596
IM
1714 * Let the scheduling class do new task startup
1715 * management (if any):
1da177e4 1716 */
dd41f596 1717 p->sched_class->task_new(rq, p);
1da177e4 1718 }
dd41f596
IM
1719 check_preempt_curr(rq, p);
1720 task_rq_unlock(rq, &flags);
1da177e4
LT
1721}
1722
4866cde0
NP
1723/**
1724 * prepare_task_switch - prepare to switch tasks
1725 * @rq: the runqueue preparing to switch
1726 * @next: the task we are going to switch to.
1727 *
1728 * This is called with the rq lock held and interrupts off. It must
1729 * be paired with a subsequent finish_task_switch after the context
1730 * switch.
1731 *
1732 * prepare_task_switch sets up locking and calls architecture specific
1733 * hooks.
1734 */
70b97a7f 1735static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
1736{
1737 prepare_lock_switch(rq, next);
1738 prepare_arch_switch(next);
1739}
1740
1da177e4
LT
1741/**
1742 * finish_task_switch - clean up after a task-switch
344babaa 1743 * @rq: runqueue associated with task-switch
1da177e4
LT
1744 * @prev: the thread we just switched away from.
1745 *
4866cde0
NP
1746 * finish_task_switch must be called after the context switch, paired
1747 * with a prepare_task_switch call before the context switch.
1748 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1749 * and do any other architecture-specific cleanup actions.
1da177e4
LT
1750 *
1751 * Note that we may have delayed dropping an mm in context_switch(). If
1752 * so, we finish that here outside of the runqueue lock. (Doing it
1753 * with the lock held can cause deadlocks; see schedule() for
1754 * details.)
1755 */
70b97a7f 1756static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1da177e4
LT
1757 __releases(rq->lock)
1758{
1da177e4 1759 struct mm_struct *mm = rq->prev_mm;
55a101f8 1760 long prev_state;
1da177e4
LT
1761
1762 rq->prev_mm = NULL;
1763
1764 /*
1765 * A task struct has one reference for the use as "current".
c394cc9f 1766 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
55a101f8
ON
1767 * schedule one last time. The schedule call will never return, and
1768 * the scheduled task must drop that reference.
c394cc9f 1769 * The test for TASK_DEAD must occur while the runqueue locks are
1da177e4
LT
1770 * still held, otherwise prev could be scheduled on another cpu, die
1771 * there before we look at prev->state, and then the reference would
1772 * be dropped twice.
1773 * Manfred Spraul <manfred@colorfullife.com>
1774 */
55a101f8 1775 prev_state = prev->state;
4866cde0
NP
1776 finish_arch_switch(prev);
1777 finish_lock_switch(rq, prev);
1da177e4
LT
1778 if (mm)
1779 mmdrop(mm);
c394cc9f 1780 if (unlikely(prev_state == TASK_DEAD)) {
c6fd91f0 1781 /*
1782 * Remove function-return probe instances associated with this
1783 * task and put them back on the free list.
1784 */
1785 kprobe_flush_task(prev);
1da177e4 1786 put_task_struct(prev);
c6fd91f0 1787 }
1da177e4
LT
1788}
1789
1790/**
1791 * schedule_tail - first thing a freshly forked thread must call.
1792 * @prev: the thread we just switched away from.
1793 */
36c8b586 1794asmlinkage void schedule_tail(struct task_struct *prev)
1da177e4
LT
1795 __releases(rq->lock)
1796{
70b97a7f
IM
1797 struct rq *rq = this_rq();
1798
4866cde0
NP
1799 finish_task_switch(rq, prev);
1800#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1801 /* In this case, finish_task_switch does not reenable preemption */
1802 preempt_enable();
1803#endif
1da177e4
LT
1804 if (current->set_child_tid)
1805 put_user(current->pid, current->set_child_tid);
1806}
1807
1808/*
1809 * context_switch - switch to the new MM and the new
1810 * thread's register state.
1811 */
dd41f596 1812static inline void
70b97a7f 1813context_switch(struct rq *rq, struct task_struct *prev,
36c8b586 1814 struct task_struct *next)
1da177e4 1815{
dd41f596 1816 struct mm_struct *mm, *oldmm;
1da177e4 1817
dd41f596
IM
1818 prepare_task_switch(rq, next);
1819 mm = next->mm;
1820 oldmm = prev->active_mm;
9226d125
ZA
1821 /*
1822 * For paravirt, this is coupled with an exit in switch_to to
1823 * combine the page table reload and the switch backend into
1824 * one hypercall.
1825 */
1826 arch_enter_lazy_cpu_mode();
1827
dd41f596 1828 if (unlikely(!mm)) {
1da177e4
LT
1829 next->active_mm = oldmm;
1830 atomic_inc(&oldmm->mm_count);
1831 enter_lazy_tlb(oldmm, next);
1832 } else
1833 switch_mm(oldmm, mm, next);
1834
dd41f596 1835 if (unlikely(!prev->mm)) {
1da177e4 1836 prev->active_mm = NULL;
1da177e4
LT
1837 rq->prev_mm = oldmm;
1838 }
3a5f5e48
IM
1839 /*
1840 * The runqueue lock will be released by the next
1841 * task (which is an invalid locking op, but in the case
1842 * of the scheduler it's an obvious special-case), so we
1843 * do an early lockdep release here:
1844 */
1845#ifndef __ARCH_WANT_UNLOCKED_CTXSW
8a25d5de 1846 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3a5f5e48 1847#endif
1da177e4
LT
1848
1849 /* Here we just switch the register state and the stack. */
1850 switch_to(prev, next, prev);
1851
dd41f596
IM
1852 barrier();
1853 /*
1854 * this_rq must be evaluated again because prev may have moved
1855 * CPUs since it called schedule(), thus the 'rq' on its stack
1856 * frame will be invalid.
1857 */
1858 finish_task_switch(this_rq(), prev);
1da177e4
LT
1859}
1860
1861/*
1862 * nr_running, nr_uninterruptible and nr_context_switches:
1863 *
1864 * externally visible scheduler statistics: current number of runnable
1865 * threads, current number of uninterruptible-sleeping threads, total
1866 * number of context switches performed since bootup.
1867 */
1868unsigned long nr_running(void)
1869{
1870 unsigned long i, sum = 0;
1871
1872 for_each_online_cpu(i)
1873 sum += cpu_rq(i)->nr_running;
1874
1875 return sum;
1876}
1877
1878unsigned long nr_uninterruptible(void)
1879{
1880 unsigned long i, sum = 0;
1881
0a945022 1882 for_each_possible_cpu(i)
1da177e4
LT
1883 sum += cpu_rq(i)->nr_uninterruptible;
1884
1885 /*
1886 * Since we read the counters locklessly, the sum might be slightly
1887 * inaccurate. Do not allow it to go below zero though:
1888 */
1889 if (unlikely((long)sum < 0))
1890 sum = 0;
1891
1892 return sum;
1893}
1894
1895unsigned long long nr_context_switches(void)
1896{
cc94abfc
SR
1897 int i;
1898 unsigned long long sum = 0;
1da177e4 1899
0a945022 1900 for_each_possible_cpu(i)
1da177e4
LT
1901 sum += cpu_rq(i)->nr_switches;
1902
1903 return sum;
1904}
1905
1906unsigned long nr_iowait(void)
1907{
1908 unsigned long i, sum = 0;
1909
0a945022 1910 for_each_possible_cpu(i)
1da177e4
LT
1911 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1912
1913 return sum;
1914}
1915
db1b1fef
JS
1916unsigned long nr_active(void)
1917{
1918 unsigned long i, running = 0, uninterruptible = 0;
1919
1920 for_each_online_cpu(i) {
1921 running += cpu_rq(i)->nr_running;
1922 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1923 }
1924
1925 if (unlikely((long)uninterruptible < 0))
1926 uninterruptible = 0;
1927
1928 return running + uninterruptible;
1929}
1930
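/*
 * Illustration only, not part of sched.c: nr_active() above is the kind of
 * lockless aggregate that loadavg-style sampling reads periodically. The
 * helper below is a hypothetical, self-contained sketch of a fixed-point
 * exponential average in the spirit of the kernel's CALC_LOAD macro; all
 * SKETCH_* names are made up for this example.
 */
#define SKETCH_FSHIFT	11
#define SKETCH_FIXED_1	(1 << SKETCH_FSHIFT)	/* 1.0 in fixed point */

static unsigned long sketch_calc_load(unsigned long load, unsigned long exp,
				      unsigned long n)
{
	/* load' = load*exp + n*(1 - exp), everything in 11-bit fixed point */
	load *= exp;
	load += n * (SKETCH_FIXED_1 - exp);
	return load >> SKETCH_FSHIFT;
}
/* e.g. avg = sketch_calc_load(avg, exp_1min, nr_active()); once per interval */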
48f24c4d 1931/*
dd41f596
IM
1932 * Update rq->cpu_load[] statistics. This function is usually called every
1933 * scheduler tick (TICK_NSEC).
48f24c4d 1934 */
dd41f596 1935static void update_cpu_load(struct rq *this_rq)
48f24c4d 1936{
dd41f596
IM
1937 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1938 unsigned long total_load = this_rq->ls.load.weight;
1939 unsigned long this_load = total_load;
1940 struct load_stat *ls = &this_rq->ls;
1941 u64 now = __rq_clock(this_rq);
1942 int i, scale;
1943
1944 this_rq->nr_load_updates++;
1945 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1946 goto do_avg;
1947
1948 /* Update delta_fair/delta_exec fields first */
1949 update_curr_load(this_rq, now);
1950
1951 fair_delta64 = ls->delta_fair + 1;
1952 ls->delta_fair = 0;
1953
1954 exec_delta64 = ls->delta_exec + 1;
1955 ls->delta_exec = 0;
1956
1957 sample_interval64 = now - ls->load_update_last;
1958 ls->load_update_last = now;
1959
1960 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1961 sample_interval64 = TICK_NSEC;
1962
1963 if (exec_delta64 > sample_interval64)
1964 exec_delta64 = sample_interval64;
1965
1966 idle_delta64 = sample_interval64 - exec_delta64;
1967
1968 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1969 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1970
1971 this_load = (unsigned long)tmp64;
1972
1973do_avg:
1974
1975 /* Update our load: */
1976 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1977 unsigned long old_load, new_load;
1978
1979 /* scale is effectively 1 << i now, and >> i divides by scale */
1980
1981 old_load = this_rq->cpu_load[i];
1982 new_load = this_load;
1983
1984 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1985 }
48f24c4d
IM
1986}
1987
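/*
 * Illustration only, not part of sched.c: the per-index update above is
 *
 *	cpu_load[i] = (old_load * (2^i - 1) + this_load) >> i
 *
 * so index 0 tracks the instantaneous load and higher indexes decay ever
 * more slowly towards new samples. A hypothetical standalone sketch of the
 * same recurrence, with a worked number: starting from cpu_load[2] == 1024
 * and this_load == 0, one step gives (1024*3 + 0) >> 2 == 768, i.e. the
 * index-2 average closes 1/4 of the gap per tick.
 */
static void sketch_decay_cpu_load(unsigned long load[], int nr_idx,
				  unsigned long this_load)
{
	unsigned long scale;
	int i;

	for (i = 0, scale = 1; i < nr_idx; i++, scale += scale)
		load[i] = (load[i] * (scale - 1) + this_load) >> i;
}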
dd41f596
IM
1988#ifdef CONFIG_SMP
1989
1da177e4
LT
1990/*
1991 * double_rq_lock - safely lock two runqueues
1992 *
1993 * Note this does not disable interrupts like task_rq_lock,
1994 * you need to do so manually before calling.
1995 */
70b97a7f 1996static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1da177e4
LT
1997 __acquires(rq1->lock)
1998 __acquires(rq2->lock)
1999{
054b9108 2000 BUG_ON(!irqs_disabled());
1da177e4
LT
2001 if (rq1 == rq2) {
2002 spin_lock(&rq1->lock);
2003 __acquire(rq2->lock); /* Fake it out ;) */
2004 } else {
c96d145e 2005 if (rq1 < rq2) {
1da177e4
LT
2006 spin_lock(&rq1->lock);
2007 spin_lock(&rq2->lock);
2008 } else {
2009 spin_lock(&rq2->lock);
2010 spin_lock(&rq1->lock);
2011 }
2012 }
2013}
2014
2015/*
2016 * double_rq_unlock - safely unlock two runqueues
2017 *
2018 * Note this does not restore interrupts like task_rq_unlock,
2019 * you need to do so manually after calling.
2020 */
70b97a7f 2021static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1da177e4
LT
2022 __releases(rq1->lock)
2023 __releases(rq2->lock)
2024{
2025 spin_unlock(&rq1->lock);
2026 if (rq1 != rq2)
2027 spin_unlock(&rq2->lock);
2028 else
2029 __release(rq2->lock);
2030}
2031
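/*
 * Illustration only, not part of sched.c: the expected calling pattern for
 * double_rq_lock()/double_rq_unlock(), mirroring what load_balance() does
 * further below. The caller must disable interrupts itself; ordering the
 * two locks by runqueue address inside double_rq_lock() is what prevents an
 * AB-BA deadlock when two CPUs lock the same pair concurrently.
 */
static void sketch_with_both_rq_locked(struct rq *rq_a, struct rq *rq_b)
{
	unsigned long flags;

	local_irq_save(flags);
	double_rq_lock(rq_a, rq_b);
	/* ... both rq->lock are held here; move/inspect tasks safely ... */
	double_rq_unlock(rq_a, rq_b);
	local_irq_restore(flags);
}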
2032/*
2033 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2034 */
70b97a7f 2035static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1da177e4
LT
2036 __releases(this_rq->lock)
2037 __acquires(busiest->lock)
2038 __acquires(this_rq->lock)
2039{
054b9108
KK
2040 if (unlikely(!irqs_disabled())) {
2041 /* printk() doesn't work well under rq->lock */
2042 spin_unlock(&this_rq->lock);
2043 BUG_ON(1);
2044 }
1da177e4 2045 if (unlikely(!spin_trylock(&busiest->lock))) {
c96d145e 2046 if (busiest < this_rq) {
1da177e4
LT
2047 spin_unlock(&this_rq->lock);
2048 spin_lock(&busiest->lock);
2049 spin_lock(&this_rq->lock);
2050 } else
2051 spin_lock(&busiest->lock);
2052 }
2053}
2054
1da177e4
LT
2055/*
2056 * If dest_cpu is allowed for this process, migrate the task to it.
2057 * This is accomplished by forcing the cpu_allowed mask to only
2058 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
2059 * the cpu_allowed mask is restored.
2060 */
36c8b586 2061static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1da177e4 2062{
70b97a7f 2063 struct migration_req req;
1da177e4 2064 unsigned long flags;
70b97a7f 2065 struct rq *rq;
1da177e4
LT
2066
2067 rq = task_rq_lock(p, &flags);
2068 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2069 || unlikely(cpu_is_offline(dest_cpu)))
2070 goto out;
2071
2072 /* force the process onto the specified CPU */
2073 if (migrate_task(p, dest_cpu, &req)) {
2074 /* Need to wait for migration thread (might exit: take ref). */
2075 struct task_struct *mt = rq->migration_thread;
36c8b586 2076
1da177e4
LT
2077 get_task_struct(mt);
2078 task_rq_unlock(rq, &flags);
2079 wake_up_process(mt);
2080 put_task_struct(mt);
2081 wait_for_completion(&req.done);
36c8b586 2082
1da177e4
LT
2083 return;
2084 }
2085out:
2086 task_rq_unlock(rq, &flags);
2087}
2088
2089/*
476d139c
NP
2090 * sched_exec - execve() is a valuable balancing opportunity, because at
2091 * this point the task has the smallest effective memory and cache footprint.
1da177e4
LT
2092 */
2093void sched_exec(void)
2094{
1da177e4 2095 int new_cpu, this_cpu = get_cpu();
476d139c 2096 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1da177e4 2097 put_cpu();
476d139c
NP
2098 if (new_cpu != this_cpu)
2099 sched_migrate_task(current, new_cpu);
1da177e4
LT
2100}
2101
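/*
 * Illustration only, not part of sched.c: sched_exec() is meant to be called
 * from the execve() path (fs/exec.c) before the new image is mapped, roughly:
 *
 *	sched_exec();			// may migrate current to an idler CPU
 *	... flush the old mm and load the new binary ...
 *
 * so any migration happens while the task's memory footprint is smallest.
 */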
2102/*
2103 * pull_task - move a task from a remote runqueue to the local runqueue.
2104 * Both runqueues must be locked.
2105 */
dd41f596
IM
2106static void pull_task(struct rq *src_rq, struct task_struct *p,
2107 struct rq *this_rq, int this_cpu)
1da177e4 2108{
dd41f596 2109 deactivate_task(src_rq, p, 0);
1da177e4 2110 set_task_cpu(p, this_cpu);
dd41f596 2111 activate_task(this_rq, p, 0);
1da177e4
LT
2112 /*
2113 * Note that idle threads have a prio of MAX_PRIO, so this test
2114 * is always true for them.
2115 */
dd41f596 2116 check_preempt_curr(this_rq, p);
1da177e4
LT
2117}
2118
2119/*
2120 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2121 */
858119e1 2122static
70b97a7f 2123int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
d15bcfdb 2124 struct sched_domain *sd, enum cpu_idle_type idle,
95cdf3b7 2125 int *all_pinned)
1da177e4
LT
2126{
2127 /*
2128 * We do not migrate tasks that are:
2129 * 1) running (obviously), or
2130 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2131 * 3) are cache-hot on their current CPU.
2132 */
1da177e4
LT
2133 if (!cpu_isset(this_cpu, p->cpus_allowed))
2134 return 0;
81026794
NP
2135 *all_pinned = 0;
2136
2137 if (task_running(rq, p))
2138 return 0;
1da177e4
LT
2139
2140 /*
dd41f596 2141 * Aggressive migration if too many balance attempts have failed:
1da177e4 2142 */
dd41f596 2143 if (sd->nr_balance_failed > sd->cache_nice_tries)
1da177e4
LT
2144 return 1;
2145
1da177e4
LT
2146 return 1;
2147}
2148
dd41f596 2149static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2dd73a4f 2150 unsigned long max_nr_move, unsigned long max_load_move,
d15bcfdb 2151 struct sched_domain *sd, enum cpu_idle_type idle,
dd41f596
IM
2152 int *all_pinned, unsigned long *load_moved,
2153 int this_best_prio, int best_prio, int best_prio_seen,
2154 struct rq_iterator *iterator)
1da177e4 2155{
dd41f596
IM
2156 int pulled = 0, pinned = 0, skip_for_load;
2157 struct task_struct *p;
2158 long rem_load_move = max_load_move;
1da177e4 2159
2dd73a4f 2160 if (max_nr_move == 0 || max_load_move == 0)
1da177e4
LT
2161 goto out;
2162
81026794
NP
2163 pinned = 1;
2164
1da177e4 2165 /*
dd41f596 2166 * Start the load-balancing iterator:
1da177e4 2167 */
dd41f596
IM
2168 p = iterator->start(iterator->arg);
2169next:
2170 if (!p)
1da177e4 2171 goto out;
50ddd969
PW
2172 /*
2173 * To help distribute high priority tasks across CPUs we don't
2174 * skip a task if it will be the highest priority task (i.e. smallest
2175 * prio value) on its new queue regardless of its load weight
2176 */
dd41f596
IM
2177 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2178 SCHED_LOAD_SCALE_FUZZ;
2179 if (skip_for_load && p->prio < this_best_prio)
2180 skip_for_load = !best_prio_seen && p->prio == best_prio;
615052dc 2181 if (skip_for_load ||
dd41f596 2182 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
48f24c4d 2183
dd41f596
IM
2184 best_prio_seen |= p->prio == best_prio;
2185 p = iterator->next(iterator->arg);
2186 goto next;
1da177e4
LT
2187 }
2188
dd41f596 2189 pull_task(busiest, p, this_rq, this_cpu);
1da177e4 2190 pulled++;
dd41f596 2191 rem_load_move -= p->se.load.weight;
1da177e4 2192
2dd73a4f
PW
2193 /*
2194 * We only want to steal up to the prescribed number of tasks
2195 * and the prescribed amount of weighted load.
2196 */
2197 if (pulled < max_nr_move && rem_load_move > 0) {
dd41f596
IM
2198 if (p->prio < this_best_prio)
2199 this_best_prio = p->prio;
2200 p = iterator->next(iterator->arg);
2201 goto next;
1da177e4
LT
2202 }
2203out:
2204 /*
2205 * Right now, this is the only place pull_task() is called,
2206 * so we can safely collect pull_task() stats here rather than
2207 * inside pull_task().
2208 */
2209 schedstat_add(sd, lb_gained[idle], pulled);
81026794
NP
2210
2211 if (all_pinned)
2212 *all_pinned = pinned;
dd41f596 2213 *load_moved = max_load_move - rem_load_move;
1da177e4
LT
2214 return pulled;
2215}
2216
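/*
 * Illustration only, not part of sched.c: a minimal iterator of the shape
 * balance_tasks() expects, assuming the arg/start/next layout of struct
 * rq_iterator used above. A scheduling class supplies callbacks like these
 * so balance_tasks() can walk its runnable tasks without knowing the class's
 * queue layout. The state struct, the list head and the use of ->run_list
 * below are hypothetical examples, not the real class implementations.
 */
struct sketch_iter_state {
	struct list_head *head;		/* hypothetical queue of tasks */
	struct list_head *curr;
};

static struct task_struct *sketch_iter_next(void *arg)
{
	struct sketch_iter_state *st = arg;

	st->curr = st->curr->next;
	if (st->curr == st->head)
		return NULL;		/* end of the queue */
	return list_entry(st->curr, struct task_struct, run_list);
}

static struct task_struct *sketch_iter_start(void *arg)
{
	struct sketch_iter_state *st = arg;

	st->curr = st->head;
	return sketch_iter_next(arg);
}
/*
 * Hypothetical usage:
 *	struct sketch_iter_state st = { .head = &queue, .curr = &queue };
 *	struct rq_iterator it = {
 *		.arg = &st, .start = sketch_iter_start, .next = sketch_iter_next,
 *	};
 *	balance_tasks(this_rq, this_cpu, busiest, max_nr, max_load, sd, idle,
 *		      &pinned, &moved, this_best_prio, best_prio,
 *		      best_prio_seen, &it);
 */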
dd41f596
IM
2217/*
2218 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2219 * load from busiest to this_rq, as part of a balancing operation within
2220 * "domain". Returns the number of tasks moved.
2221 *
2222 * Called with both runqueues locked.
2223 */
2224static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2225 unsigned long max_nr_move, unsigned long max_load_move,
2226 struct sched_domain *sd, enum cpu_idle_type idle,
2227 int *all_pinned)
2228{
2229 struct sched_class *class = sched_class_highest;
2230 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2231 long rem_load_move = max_load_move;
2232
2233 do {
2234 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2235 max_nr_move, (unsigned long)rem_load_move,
2236 sd, idle, all_pinned, &load_moved);
2237 total_nr_moved += nr_moved;
2238 max_nr_move -= nr_moved;
2239 rem_load_move -= load_moved;
2240 class = class->next;
2241 } while (class && max_nr_move && rem_load_move > 0);
2242
2243 return total_nr_moved;
2244}
2245
1da177e4
LT
2246/*
2247 * find_busiest_group finds and returns the busiest CPU group within the
48f24c4d
IM
2248 * domain. It calculates and returns the amount of weighted load which
2249 * should be moved to restore balance via the imbalance parameter.
1da177e4
LT
2250 */
2251static struct sched_group *
2252find_busiest_group(struct sched_domain *sd, int this_cpu,
dd41f596
IM
2253 unsigned long *imbalance, enum cpu_idle_type idle,
2254 int *sd_idle, cpumask_t *cpus, int *balance)
1da177e4
LT
2255{
2256 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2257 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
0c117f1b 2258 unsigned long max_pull;
2dd73a4f
PW
2259 unsigned long busiest_load_per_task, busiest_nr_running;
2260 unsigned long this_load_per_task, this_nr_running;
7897986b 2261 int load_idx;
5c45bf27
SS
2262#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2263 int power_savings_balance = 1;
2264 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2265 unsigned long min_nr_running = ULONG_MAX;
2266 struct sched_group *group_min = NULL, *group_leader = NULL;
2267#endif
1da177e4
LT
2268
2269 max_load = this_load = total_load = total_pwr = 0;
2dd73a4f
PW
2270 busiest_load_per_task = busiest_nr_running = 0;
2271 this_load_per_task = this_nr_running = 0;
d15bcfdb 2272 if (idle == CPU_NOT_IDLE)
7897986b 2273 load_idx = sd->busy_idx;
d15bcfdb 2274 else if (idle == CPU_NEWLY_IDLE)
7897986b
NP
2275 load_idx = sd->newidle_idx;
2276 else
2277 load_idx = sd->idle_idx;
1da177e4
LT
2278
2279 do {
5c45bf27 2280 unsigned long load, group_capacity;
1da177e4
LT
2281 int local_group;
2282 int i;
783609c6 2283 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2dd73a4f 2284 unsigned long sum_nr_running, sum_weighted_load;
1da177e4
LT
2285
2286 local_group = cpu_isset(this_cpu, group->cpumask);
2287
783609c6
SS
2288 if (local_group)
2289 balance_cpu = first_cpu(group->cpumask);
2290
1da177e4 2291 /* Tally up the load of all CPUs in the group */
2dd73a4f 2292 sum_weighted_load = sum_nr_running = avg_load = 0;
1da177e4
LT
2293
2294 for_each_cpu_mask(i, group->cpumask) {
0a2966b4
CL
2295 struct rq *rq;
2296
2297 if (!cpu_isset(i, *cpus))
2298 continue;
2299
2300 rq = cpu_rq(i);
2dd73a4f 2301
5969fe06
NP
2302 if (*sd_idle && !idle_cpu(i))
2303 *sd_idle = 0;
2304
1da177e4 2305 /* Bias balancing toward cpus of our domain */
783609c6
SS
2306 if (local_group) {
2307 if (idle_cpu(i) && !first_idle_cpu) {
2308 first_idle_cpu = 1;
2309 balance_cpu = i;
2310 }
2311
a2000572 2312 load = target_load(i, load_idx);
783609c6 2313 } else
a2000572 2314 load = source_load(i, load_idx);
1da177e4
LT
2315
2316 avg_load += load;
2dd73a4f 2317 sum_nr_running += rq->nr_running;
dd41f596 2318 sum_weighted_load += weighted_cpuload(i);
1da177e4
LT
2319 }
2320
783609c6
SS
2321 /*
2322 * The first idle cpu, or the first cpu (busiest) in this sched group,
2323 * is eligible for doing load balancing at this domain and the
2324 * domains above it.
2325 */
2326 if (local_group && balance_cpu != this_cpu && balance) {
2327 *balance = 0;
2328 goto ret;
2329 }
2330
1da177e4 2331 total_load += avg_load;
5517d86b 2332 total_pwr += group->__cpu_power;
1da177e4
LT
2333
2334 /* Adjust by relative CPU power of the group */
5517d86b
ED
2335 avg_load = sg_div_cpu_power(group,
2336 avg_load * SCHED_LOAD_SCALE);
1da177e4 2337
5517d86b 2338 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
5c45bf27 2339
1da177e4
LT
2340 if (local_group) {
2341 this_load = avg_load;
2342 this = group;
2dd73a4f
PW
2343 this_nr_running = sum_nr_running;
2344 this_load_per_task = sum_weighted_load;
2345 } else if (avg_load > max_load &&
5c45bf27 2346 sum_nr_running > group_capacity) {
1da177e4
LT
2347 max_load = avg_load;
2348 busiest = group;
2dd73a4f
PW
2349 busiest_nr_running = sum_nr_running;
2350 busiest_load_per_task = sum_weighted_load;
1da177e4 2351 }
5c45bf27
SS
2352
2353#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2354 /*
2355 * Busy processors will not participate in power savings
2356 * balance.
2357 */
dd41f596
IM
2358 if (idle == CPU_NOT_IDLE ||
2359 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2360 goto group_next;
5c45bf27
SS
2361
2362 /*
2363 * If the local group is idle or completely loaded
2364 * no need to do power savings balance at this domain
2365 */
2366 if (local_group && (this_nr_running >= group_capacity ||
2367 !this_nr_running))
2368 power_savings_balance = 0;
2369
dd41f596 2370 /*
5c45bf27
SS
2371 * If a group is already running at full capacity or idle,
2372 * don't include that group in power savings calculations
dd41f596
IM
2373 */
2374 if (!power_savings_balance || sum_nr_running >= group_capacity
5c45bf27 2375 || !sum_nr_running)
dd41f596 2376 goto group_next;
5c45bf27 2377
dd41f596 2378 /*
5c45bf27 2379 * Calculate the group which has the least non-idle load.
dd41f596
IM
2380 * This is the group from where we need to pick up the load
2381 * for saving power
2382 */
2383 if ((sum_nr_running < min_nr_running) ||
2384 (sum_nr_running == min_nr_running &&
5c45bf27
SS
2385 first_cpu(group->cpumask) <
2386 first_cpu(group_min->cpumask))) {
dd41f596
IM
2387 group_min = group;
2388 min_nr_running = sum_nr_running;
5c45bf27
SS
2389 min_load_per_task = sum_weighted_load /
2390 sum_nr_running;
dd41f596 2391 }
5c45bf27 2392
dd41f596 2393 /*
5c45bf27 2394 * Calculate the group which is almost near its
dd41f596
IM
2395 * capacity but still has some space to pick up some load
2396 * from other group and save more power
2397 */
2398 if (sum_nr_running <= group_capacity - 1) {
2399 if (sum_nr_running > leader_nr_running ||
2400 (sum_nr_running == leader_nr_running &&
2401 first_cpu(group->cpumask) >
2402 first_cpu(group_leader->cpumask))) {
2403 group_leader = group;
2404 leader_nr_running = sum_nr_running;
2405 }
48f24c4d 2406 }
5c45bf27
SS
2407group_next:
2408#endif
1da177e4
LT
2409 group = group->next;
2410 } while (group != sd->groups);
2411
2dd73a4f 2412 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
1da177e4
LT
2413 goto out_balanced;
2414
2415 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2416
2417 if (this_load >= avg_load ||
2418 100*max_load <= sd->imbalance_pct*this_load)
2419 goto out_balanced;
2420
2dd73a4f 2421 busiest_load_per_task /= busiest_nr_running;
1da177e4
LT
2422 /*
2423 * We're trying to get all the cpus to the average_load, so we don't
2424 * want to push ourselves above the average load, nor do we wish to
2425 * reduce the max loaded cpu below the average load, as either of these
2426 * actions would just result in more rebalancing later, and ping-pong
2427 * tasks around. Thus we look for the minimum possible imbalance.
2428 * Negative imbalances (*we* are more loaded than anyone else) will
2429 * be counted as no imbalance for these purposes -- we can't fix that
2430 * by pulling tasks to us. Be careful of negative numbers as they'll
2431 * appear as very large values with unsigned longs.
2432 */
2dd73a4f
PW
2433 if (max_load <= busiest_load_per_task)
2434 goto out_balanced;
2435
2436 /*
2437 * In the presence of smp nice balancing, certain scenarios can have
2438 * max load less than avg load (as we skip the groups at or below
2439 * their cpu_power while calculating max_load)
2440 */
2441 if (max_load < avg_load) {
2442 *imbalance = 0;
2443 goto small_imbalance;
2444 }
0c117f1b
SS
2445
2446 /* Don't want to pull so many tasks that a group would go idle */
2dd73a4f 2447 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
0c117f1b 2448
1da177e4 2449 /* How much load to actually move to equalise the imbalance */
5517d86b
ED
2450 *imbalance = min(max_pull * busiest->__cpu_power,
2451 (avg_load - this_load) * this->__cpu_power)
1da177e4
LT
2452 / SCHED_LOAD_SCALE;
2453
2dd73a4f
PW
2454 /*
2455 * If *imbalance is less than the average load per runnable task,
2456 * there is no guarantee that any tasks will be moved, so we may
2457 * need to bump its value to force at least one task to be
2458 * moved.
2459 */
dd41f596 2460 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
48f24c4d 2461 unsigned long tmp, pwr_now, pwr_move;
2dd73a4f
PW
2462 unsigned int imbn;
2463
2464small_imbalance:
2465 pwr_move = pwr_now = 0;
2466 imbn = 2;
2467 if (this_nr_running) {
2468 this_load_per_task /= this_nr_running;
2469 if (busiest_load_per_task > this_load_per_task)
2470 imbn = 1;
2471 } else
2472 this_load_per_task = SCHED_LOAD_SCALE;
1da177e4 2473
dd41f596
IM
2474 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2475 busiest_load_per_task * imbn) {
2dd73a4f 2476 *imbalance = busiest_load_per_task;
1da177e4
LT
2477 return busiest;
2478 }
2479
2480 /*
2481 * OK, we don't have enough imbalance to justify moving tasks,
2482 * however we may be able to increase total CPU power used by
2483 * moving them.
2484 */
2485
5517d86b
ED
2486 pwr_now += busiest->__cpu_power *
2487 min(busiest_load_per_task, max_load);
2488 pwr_now += this->__cpu_power *
2489 min(this_load_per_task, this_load);
1da177e4
LT
2490 pwr_now /= SCHED_LOAD_SCALE;
2491
2492 /* Amount of load we'd subtract */
5517d86b
ED
2493 tmp = sg_div_cpu_power(busiest,
2494 busiest_load_per_task * SCHED_LOAD_SCALE);
1da177e4 2495 if (max_load > tmp)
5517d86b 2496 pwr_move += busiest->__cpu_power *
2dd73a4f 2497 min(busiest_load_per_task, max_load - tmp);
1da177e4
LT
2498
2499 /* Amount of load we'd add */
5517d86b 2500 if (max_load * busiest->__cpu_power <
33859f7f 2501 busiest_load_per_task * SCHED_LOAD_SCALE)
5517d86b
ED
2502 tmp = sg_div_cpu_power(this,
2503 max_load * busiest->__cpu_power);
1da177e4 2504 else
5517d86b
ED
2505 tmp = sg_div_cpu_power(this,
2506 busiest_load_per_task * SCHED_LOAD_SCALE);
2507 pwr_move += this->__cpu_power *
2508 min(this_load_per_task, this_load + tmp);
1da177e4
LT
2509 pwr_move /= SCHED_LOAD_SCALE;
2510
2511 /* Move if we gain throughput */
2512 if (pwr_move <= pwr_now)
2513 goto out_balanced;
2514
2dd73a4f 2515 *imbalance = busiest_load_per_task;
1da177e4
LT
2516 }
2517
1da177e4
LT
2518 return busiest;
2519
2520out_balanced:
5c45bf27 2521#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
d15bcfdb 2522 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
5c45bf27 2523 goto ret;
1da177e4 2524
5c45bf27
SS
2525 if (this == group_leader && group_leader != group_min) {
2526 *imbalance = min_load_per_task;
2527 return group_min;
2528 }
5c45bf27 2529#endif
783609c6 2530ret:
1da177e4
LT
2531 *imbalance = 0;
2532 return NULL;
2533}
2534
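/*
 * Worked example for the imbalance arithmetic above (illustrative numbers,
 * not from a real trace). Assume SCHED_LOAD_SCALE == 1024, two groups each
 * with __cpu_power == 1024, max_load == 3072, this_load == 1024 and
 * busiest_load_per_task == 1024. Then:
 *
 *	avg_load  = 1024 * (3072 + 1024) / 2048                  = 2048
 *	max_pull  = min(3072 - 2048, 3072 - 1024)                = 1024
 *	imbalance = min(1024*1024, (2048 - 1024)*1024) / 1024    = 1024
 *
 * i.e. one nice-0 task's worth of weighted load should move from the busiest
 * group to this one, which is exactly what equalises the two group loads.
 */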
2535/*
2536 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2537 */
70b97a7f 2538static struct rq *
d15bcfdb 2539find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
0a2966b4 2540 unsigned long imbalance, cpumask_t *cpus)
1da177e4 2541{
70b97a7f 2542 struct rq *busiest = NULL, *rq;
2dd73a4f 2543 unsigned long max_load = 0;
1da177e4
LT
2544 int i;
2545
2546 for_each_cpu_mask(i, group->cpumask) {
dd41f596 2547 unsigned long wl;
0a2966b4
CL
2548
2549 if (!cpu_isset(i, *cpus))
2550 continue;
2551
48f24c4d 2552 rq = cpu_rq(i);
dd41f596 2553 wl = weighted_cpuload(i);
2dd73a4f 2554
dd41f596 2555 if (rq->nr_running == 1 && wl > imbalance)
2dd73a4f 2556 continue;
1da177e4 2557
dd41f596
IM
2558 if (wl > max_load) {
2559 max_load = wl;
48f24c4d 2560 busiest = rq;
1da177e4
LT
2561 }
2562 }
2563
2564 return busiest;
2565}
2566
77391d71
NP
2567/*
2568 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2569 * so long as it is large enough.
2570 */
2571#define MAX_PINNED_INTERVAL 512
2572
48f24c4d
IM
2573static inline unsigned long minus_1_or_zero(unsigned long n)
2574{
2575 return n > 0 ? n - 1 : 0;
2576}
2577
1da177e4
LT
2578/*
2579 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2580 * tasks if there is an imbalance.
1da177e4 2581 */
70b97a7f 2582static int load_balance(int this_cpu, struct rq *this_rq,
d15bcfdb 2583 struct sched_domain *sd, enum cpu_idle_type idle,
783609c6 2584 int *balance)
1da177e4 2585{
48f24c4d 2586 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
1da177e4 2587 struct sched_group *group;
1da177e4 2588 unsigned long imbalance;
70b97a7f 2589 struct rq *busiest;
0a2966b4 2590 cpumask_t cpus = CPU_MASK_ALL;
fe2eea3f 2591 unsigned long flags;
5969fe06 2592
89c4710e
SS
2593 /*
2594 * When power savings policy is enabled for the parent domain, an idle
2595 * sibling can pick up load irrespective of busy siblings. In this case,
dd41f596 2596 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
d15bcfdb 2597 * portraying it as CPU_NOT_IDLE.
89c4710e 2598 */
d15bcfdb 2599 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2600 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2601 sd_idle = 1;
1da177e4 2602
1da177e4
LT
2603 schedstat_inc(sd, lb_cnt[idle]);
2604
0a2966b4
CL
2605redo:
2606 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
783609c6
SS
2607 &cpus, balance);
2608
06066714 2609 if (*balance == 0)
783609c6 2610 goto out_balanced;
783609c6 2611
1da177e4
LT
2612 if (!group) {
2613 schedstat_inc(sd, lb_nobusyg[idle]);
2614 goto out_balanced;
2615 }
2616
0a2966b4 2617 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
1da177e4
LT
2618 if (!busiest) {
2619 schedstat_inc(sd, lb_nobusyq[idle]);
2620 goto out_balanced;
2621 }
2622
db935dbd 2623 BUG_ON(busiest == this_rq);
1da177e4
LT
2624
2625 schedstat_add(sd, lb_imbalance[idle], imbalance);
2626
2627 nr_moved = 0;
2628 if (busiest->nr_running > 1) {
2629 /*
2630 * Attempt to move tasks. If find_busiest_group has found
2631 * an imbalance but busiest->nr_running <= 1, the group is
2632 * still unbalanced. nr_moved simply stays zero, so it is
2633 * correctly treated as an imbalance.
2634 */
fe2eea3f 2635 local_irq_save(flags);
e17224bf 2636 double_rq_lock(this_rq, busiest);
1da177e4 2637 nr_moved = move_tasks(this_rq, this_cpu, busiest,
48f24c4d
IM
2638 minus_1_or_zero(busiest->nr_running),
2639 imbalance, sd, idle, &all_pinned);
e17224bf 2640 double_rq_unlock(this_rq, busiest);
fe2eea3f 2641 local_irq_restore(flags);
81026794 2642
46cb4b7c
SS
2643 /*
2644 * some other cpu did the load balance for us.
2645 */
2646 if (nr_moved && this_cpu != smp_processor_id())
2647 resched_cpu(this_cpu);
2648
81026794 2649 /* All tasks on this runqueue were pinned by CPU affinity */
0a2966b4
CL
2650 if (unlikely(all_pinned)) {
2651 cpu_clear(cpu_of(busiest), cpus);
2652 if (!cpus_empty(cpus))
2653 goto redo;
81026794 2654 goto out_balanced;
0a2966b4 2655 }
1da177e4 2656 }
81026794 2657
1da177e4
LT
2658 if (!nr_moved) {
2659 schedstat_inc(sd, lb_failed[idle]);
2660 sd->nr_balance_failed++;
2661
2662 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1da177e4 2663
fe2eea3f 2664 spin_lock_irqsave(&busiest->lock, flags);
fa3b6ddc
SS
2665
2666 /* don't kick the migration_thread, if the curr
2667 * task on busiest cpu can't be moved to this_cpu
2668 */
2669 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
fe2eea3f 2670 spin_unlock_irqrestore(&busiest->lock, flags);
fa3b6ddc
SS
2671 all_pinned = 1;
2672 goto out_one_pinned;
2673 }
2674
1da177e4
LT
2675 if (!busiest->active_balance) {
2676 busiest->active_balance = 1;
2677 busiest->push_cpu = this_cpu;
81026794 2678 active_balance = 1;
1da177e4 2679 }
fe2eea3f 2680 spin_unlock_irqrestore(&busiest->lock, flags);
81026794 2681 if (active_balance)
1da177e4
LT
2682 wake_up_process(busiest->migration_thread);
2683
2684 /*
2685 * We've kicked active balancing, reset the failure
2686 * counter.
2687 */
39507451 2688 sd->nr_balance_failed = sd->cache_nice_tries+1;
1da177e4 2689 }
81026794 2690 } else
1da177e4
LT
2691 sd->nr_balance_failed = 0;
2692
81026794 2693 if (likely(!active_balance)) {
1da177e4
LT
2694 /* We were unbalanced, so reset the balancing interval */
2695 sd->balance_interval = sd->min_interval;
81026794
NP
2696 } else {
2697 /*
2698 * If we've begun active balancing, start to back off. This
2699 * case may not be covered by the all_pinned logic if there
2700 * is only 1 task on the busy runqueue (because we don't call
2701 * move_tasks).
2702 */
2703 if (sd->balance_interval < sd->max_interval)
2704 sd->balance_interval *= 2;
1da177e4
LT
2705 }
2706
5c45bf27 2707 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2708 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2709 return -1;
1da177e4
LT
2710 return nr_moved;
2711
2712out_balanced:
1da177e4
LT
2713 schedstat_inc(sd, lb_balanced[idle]);
2714
16cfb1c0 2715 sd->nr_balance_failed = 0;
fa3b6ddc
SS
2716
2717out_one_pinned:
1da177e4 2718 /* tune up the balancing interval */
77391d71
NP
2719 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2720 (sd->balance_interval < sd->max_interval))
1da177e4
LT
2721 sd->balance_interval *= 2;
2722
48f24c4d 2723 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2724 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2725 return -1;
1da177e4
LT
2726 return 0;
2727}
2728
2729/*
2730 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2731 * tasks if there is an imbalance.
2732 *
d15bcfdb 2733 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
1da177e4
LT
2734 * this_rq is locked.
2735 */
48f24c4d 2736static int
70b97a7f 2737load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
1da177e4
LT
2738{
2739 struct sched_group *group;
70b97a7f 2740 struct rq *busiest = NULL;
1da177e4
LT
2741 unsigned long imbalance;
2742 int nr_moved = 0;
5969fe06 2743 int sd_idle = 0;
0a2966b4 2744 cpumask_t cpus = CPU_MASK_ALL;
5969fe06 2745
89c4710e
SS
2746 /*
2747 * When power savings policy is enabled for the parent domain, an idle
2748 * sibling can pick up load irrespective of busy siblings. In this case,
2749 * let the state of the idle sibling percolate up as IDLE, instead of
d15bcfdb 2750 * portraying it as CPU_NOT_IDLE.
89c4710e
SS
2751 */
2752 if (sd->flags & SD_SHARE_CPUPOWER &&
2753 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2754 sd_idle = 1;
1da177e4 2755
d15bcfdb 2756 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
0a2966b4 2757redo:
d15bcfdb 2758 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
783609c6 2759 &sd_idle, &cpus, NULL);
1da177e4 2760 if (!group) {
d15bcfdb 2761 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
16cfb1c0 2762 goto out_balanced;
1da177e4
LT
2763 }
2764
d15bcfdb 2765 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
0a2966b4 2766 &cpus);
db935dbd 2767 if (!busiest) {
d15bcfdb 2768 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
16cfb1c0 2769 goto out_balanced;
1da177e4
LT
2770 }
2771
db935dbd
NP
2772 BUG_ON(busiest == this_rq);
2773
d15bcfdb 2774 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
d6d5cfaf
NP
2775
2776 nr_moved = 0;
2777 if (busiest->nr_running > 1) {
2778 /* Attempt to move tasks */
2779 double_lock_balance(this_rq, busiest);
2780 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2dd73a4f 2781 minus_1_or_zero(busiest->nr_running),
d15bcfdb 2782 imbalance, sd, CPU_NEWLY_IDLE, NULL);
d6d5cfaf 2783 spin_unlock(&busiest->lock);
0a2966b4
CL
2784
2785 if (!nr_moved) {
2786 cpu_clear(cpu_of(busiest), cpus);
2787 if (!cpus_empty(cpus))
2788 goto redo;
2789 }
d6d5cfaf
NP
2790 }
2791
5969fe06 2792 if (!nr_moved) {
d15bcfdb 2793 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
89c4710e
SS
2794 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2795 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06
NP
2796 return -1;
2797 } else
16cfb1c0 2798 sd->nr_balance_failed = 0;
1da177e4 2799
1da177e4 2800 return nr_moved;
16cfb1c0
NP
2801
2802out_balanced:
d15bcfdb 2803 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
48f24c4d 2804 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2805 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2806 return -1;
16cfb1c0 2807 sd->nr_balance_failed = 0;
48f24c4d 2808
16cfb1c0 2809 return 0;
1da177e4
LT
2810}
2811
2812/*
2813 * idle_balance is called by schedule() if this_cpu is about to become
2814 * idle. Attempts to pull tasks from other CPUs.
2815 */
70b97a7f 2816static void idle_balance(int this_cpu, struct rq *this_rq)
1da177e4
LT
2817{
2818 struct sched_domain *sd;
dd41f596
IM
2819 int pulled_task = -1;
2820 unsigned long next_balance = jiffies + HZ;
1da177e4
LT
2821
2822 for_each_domain(this_cpu, sd) {
92c4ca5c
CL
2823 unsigned long interval;
2824
2825 if (!(sd->flags & SD_LOAD_BALANCE))
2826 continue;
2827
2828 if (sd->flags & SD_BALANCE_NEWIDLE)
48f24c4d 2829 /* If we've pulled tasks over stop searching: */
1bd77f2d 2830 pulled_task = load_balance_newidle(this_cpu,
92c4ca5c
CL
2831 this_rq, sd);
2832
2833 interval = msecs_to_jiffies(sd->balance_interval);
2834 if (time_after(next_balance, sd->last_balance + interval))
2835 next_balance = sd->last_balance + interval;
2836 if (pulled_task)
2837 break;
1da177e4 2838 }
dd41f596 2839 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
1bd77f2d
CL
2840 /*
2841 * We are going idle. next_balance may be set based on
2842 * a busy processor. So reset next_balance.
2843 */
2844 this_rq->next_balance = next_balance;
dd41f596 2845 }
1da177e4
LT
2846}
2847
2848/*
2849 * active_load_balance is run by migration threads. It pushes running tasks
2850 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2851 * running on each physical CPU where possible, and avoids physical /
2852 * logical imbalances.
2853 *
2854 * Called with busiest_rq locked.
2855 */
70b97a7f 2856static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
1da177e4 2857{
39507451 2858 int target_cpu = busiest_rq->push_cpu;
70b97a7f
IM
2859 struct sched_domain *sd;
2860 struct rq *target_rq;
39507451 2861
48f24c4d 2862 /* Is there any task to move? */
39507451 2863 if (busiest_rq->nr_running <= 1)
39507451
NP
2864 return;
2865
2866 target_rq = cpu_rq(target_cpu);
1da177e4
LT
2867
2868 /*
39507451
NP
2869 * This condition is "impossible", if it occurs
2870 * we need to fix it. Originally reported by
2871 * Bjorn Helgaas on a 128-cpu setup.
1da177e4 2872 */
39507451 2873 BUG_ON(busiest_rq == target_rq);
1da177e4 2874
39507451
NP
2875 /* move a task from busiest_rq to target_rq */
2876 double_lock_balance(busiest_rq, target_rq);
2877
2878 /* Search for an sd spanning us and the target CPU. */
c96d145e 2879 for_each_domain(target_cpu, sd) {
39507451 2880 if ((sd->flags & SD_LOAD_BALANCE) &&
48f24c4d 2881 cpu_isset(busiest_cpu, sd->span))
39507451 2882 break;
c96d145e 2883 }
39507451 2884
48f24c4d
IM
2885 if (likely(sd)) {
2886 schedstat_inc(sd, alb_cnt);
39507451 2887
48f24c4d 2888 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
d15bcfdb 2889 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
48f24c4d
IM
2890 NULL))
2891 schedstat_inc(sd, alb_pushed);
2892 else
2893 schedstat_inc(sd, alb_failed);
2894 }
39507451 2895 spin_unlock(&target_rq->lock);
1da177e4
LT
2896}
2897
46cb4b7c
SS
2898#ifdef CONFIG_NO_HZ
2899static struct {
2900 atomic_t load_balancer;
2901 cpumask_t cpu_mask;
2902} nohz ____cacheline_aligned = {
2903 .load_balancer = ATOMIC_INIT(-1),
2904 .cpu_mask = CPU_MASK_NONE,
2905};
2906
7835b98b 2907/*
46cb4b7c
SS
2908 * This routine will try to nominate the ilb (idle load balancing)
2909 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2910 * load balancing on behalf of all those cpus. If all the cpus in the system
2911 * go into this tickless mode, then there will be no ilb owner (as there is
2912 * no need for one) and all the cpus will sleep till the next wakeup event
2913 * arrives...
2914 *
2915 * For the ilb owner, the tick is not stopped, and this tick will be used
2916 * for idle load balancing. The ilb owner will still be part of
2917 * nohz.cpu_mask.
7835b98b 2918 *
46cb4b7c
SS
2919 * While stopping the tick, this cpu will become the ilb owner if there
2920 * is no other owner. It will remain the owner until it becomes busy,
2921 * or until all cpus in the system stop their ticks, at which point
2922 * there is no need for an ilb owner.
2923 *
2924 * When the ilb owner becomes busy, it nominates another owner during the
2925 * next busy scheduler_tick().
2926 */
2927int select_nohz_load_balancer(int stop_tick)
2928{
2929 int cpu = smp_processor_id();
2930
2931 if (stop_tick) {
2932 cpu_set(cpu, nohz.cpu_mask);
2933 cpu_rq(cpu)->in_nohz_recently = 1;
2934
2935 /*
2936 * If we are going offline and still the leader, give up!
2937 */
2938 if (cpu_is_offline(cpu) &&
2939 atomic_read(&nohz.load_balancer) == cpu) {
2940 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2941 BUG();
2942 return 0;
2943 }
2944
2945 /* time for ilb owner also to sleep */
2946 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2947 if (atomic_read(&nohz.load_balancer) == cpu)
2948 atomic_set(&nohz.load_balancer, -1);
2949 return 0;
2950 }
2951
2952 if (atomic_read(&nohz.load_balancer) == -1) {
2953 /* make me the ilb owner */
2954 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2955 return 1;
2956 } else if (atomic_read(&nohz.load_balancer) == cpu)
2957 return 1;
2958 } else {
2959 if (!cpu_isset(cpu, nohz.cpu_mask))
2960 return 0;
2961
2962 cpu_clear(cpu, nohz.cpu_mask);
2963
2964 if (atomic_read(&nohz.load_balancer) == cpu)
2965 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2966 BUG();
2967 }
2968 return 0;
2969}
2970#endif
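#ifdef CONFIG_NO_HZ
/*
 * Illustration only, not part of sched.c: how a tick-stop path is expected
 * to consult select_nohz_load_balancer(). The function below is a
 * hypothetical stand-in for the real nohz tick code; the contract it shows
 * is the one documented above: a return value of 1 means "this cpu is now
 * the idle load balancer, so keep its tick running".
 */
static int sketch_try_stop_tick(void)
{
	if (select_nohz_load_balancer(1))
		return 0;	/* nominated ilb owner: keep the tick */

	/* ... the timer code would actually stop the periodic tick here ... */
	return 1;
}
/*
 * When the cpu becomes busy again, the tick code is expected to call
 * select_nohz_load_balancer(0) to drop out of nohz.cpu_mask.
 */
#endif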
2971
2972static DEFINE_SPINLOCK(balancing);
2973
2974/*
7835b98b
CL
2975 * It checks each scheduling domain to see if it is due to be balanced,
2976 * and initiates a balancing operation if so.
2977 *
2978 * Balancing parameters are set up in arch_init_sched_domains.
2979 */
d15bcfdb 2980static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
7835b98b 2981{
46cb4b7c
SS
2982 int balance = 1;
2983 struct rq *rq = cpu_rq(cpu);
7835b98b
CL
2984 unsigned long interval;
2985 struct sched_domain *sd;
46cb4b7c 2986 /* Earliest time when we have to do rebalance again */
c9819f45 2987 unsigned long next_balance = jiffies + 60*HZ;
1da177e4 2988
46cb4b7c 2989 for_each_domain(cpu, sd) {
1da177e4
LT
2990 if (!(sd->flags & SD_LOAD_BALANCE))
2991 continue;
2992
2993 interval = sd->balance_interval;
d15bcfdb 2994 if (idle != CPU_IDLE)
1da177e4
LT
2995 interval *= sd->busy_factor;
2996
2997 /* scale ms to jiffies */
2998 interval = msecs_to_jiffies(interval);
2999 if (unlikely(!interval))
3000 interval = 1;
dd41f596
IM
3001 if (interval > HZ*NR_CPUS/10)
3002 interval = HZ*NR_CPUS/10;
3003
1da177e4 3004
08c183f3
CL
3005 if (sd->flags & SD_SERIALIZE) {
3006 if (!spin_trylock(&balancing))
3007 goto out;
3008 }
3009
c9819f45 3010 if (time_after_eq(jiffies, sd->last_balance + interval)) {
46cb4b7c 3011 if (load_balance(cpu, rq, sd, idle, &balance)) {
fa3b6ddc
SS
3012 /*
3013 * We've pulled tasks over so either we're no
5969fe06
NP
3014 * longer idle, or one of our SMT siblings is
3015 * not idle.
3016 */
d15bcfdb 3017 idle = CPU_NOT_IDLE;
1da177e4 3018 }
1bd77f2d 3019 sd->last_balance = jiffies;
1da177e4 3020 }
08c183f3
CL
3021 if (sd->flags & SD_SERIALIZE)
3022 spin_unlock(&balancing);
3023out:
c9819f45
CL
3024 if (time_after(next_balance, sd->last_balance + interval))
3025 next_balance = sd->last_balance + interval;
783609c6
SS
3026
3027 /*
3028 * Stop the load balance at this level. There is another
3029 * CPU in our sched group which is doing load balancing more
3030 * actively.
3031 */
3032 if (!balance)
3033 break;
1da177e4 3034 }
46cb4b7c
SS
3035 rq->next_balance = next_balance;
3036}
3037
3038/*
3039 * run_rebalance_domains is triggered when needed from the scheduler tick.
3040 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3041 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3042 */
3043static void run_rebalance_domains(struct softirq_action *h)
3044{
dd41f596
IM
3045 int this_cpu = smp_processor_id();
3046 struct rq *this_rq = cpu_rq(this_cpu);
3047 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3048 CPU_IDLE : CPU_NOT_IDLE;
46cb4b7c 3049
dd41f596 3050 rebalance_domains(this_cpu, idle);
46cb4b7c
SS
3051
3052#ifdef CONFIG_NO_HZ
3053 /*
3054 * If this cpu is the owner for idle load balancing, then do the
3055 * balancing on behalf of the other idle cpus whose ticks are
3056 * stopped.
3057 */
dd41f596
IM
3058 if (this_rq->idle_at_tick &&
3059 atomic_read(&nohz.load_balancer) == this_cpu) {
46cb4b7c
SS
3060 cpumask_t cpus = nohz.cpu_mask;
3061 struct rq *rq;
3062 int balance_cpu;
3063
dd41f596 3064 cpu_clear(this_cpu, cpus);
46cb4b7c
SS
3065 for_each_cpu_mask(balance_cpu, cpus) {
3066 /*
3067 * If this cpu gets work to do, stop the load balancing
3068 * work being done for other cpus. The next load
3069 * balancing owner will pick it up.
3070 */
3071 if (need_resched())
3072 break;
3073
dd41f596 3074 rebalance_domains(balance_cpu, CPU_IDLE);
46cb4b7c
SS
3075
3076 rq = cpu_rq(balance_cpu);
dd41f596
IM
3077 if (time_after(this_rq->next_balance, rq->next_balance))
3078 this_rq->next_balance = rq->next_balance;
46cb4b7c
SS
3079 }
3080 }
3081#endif
3082}
3083
3084/*
3085 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3086 *
3087 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3088 * idle load balancing owner or decide to stop the periodic load balancing,
3089 * if the whole system is idle.
3090 */
dd41f596 3091static inline void trigger_load_balance(struct rq *rq, int cpu)
46cb4b7c 3092{
46cb4b7c
SS
3093#ifdef CONFIG_NO_HZ
3094 /*
3095 * If we were in the nohz mode recently and busy at the current
3096 * scheduler tick, then check if we need to nominate new idle
3097 * load balancer.
3098 */
3099 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3100 rq->in_nohz_recently = 0;
3101
3102 if (atomic_read(&nohz.load_balancer) == cpu) {
3103 cpu_clear(cpu, nohz.cpu_mask);
3104 atomic_set(&nohz.load_balancer, -1);
3105 }
3106
3107 if (atomic_read(&nohz.load_balancer) == -1) {
3108 /*
3109 * simple selection for now: Nominate the
3110 * first cpu in the nohz list to be the next
3111 * ilb owner.
3112 *
3113 * TBD: Traverse the sched domains and nominate
3114 * the nearest cpu in the nohz.cpu_mask.
3115 */
3116 int ilb = first_cpu(nohz.cpu_mask);
3117
3118 if (ilb != NR_CPUS)
3119 resched_cpu(ilb);
3120 }
3121 }
3122
3123 /*
3124 * If this cpu is idle and doing idle load balancing for all the
3125 * cpus with ticks stopped, is it time for that to stop?
3126 */
3127 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3128 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3129 resched_cpu(cpu);
3130 return;
3131 }
3132
3133 /*
3134 * If this cpu is idle and the idle load balancing is done by
3135 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3136 */
3137 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3138 cpu_isset(cpu, nohz.cpu_mask))
3139 return;
3140#endif
3141 if (time_after_eq(jiffies, rq->next_balance))
3142 raise_softirq(SCHED_SOFTIRQ);
1da177e4 3143}
dd41f596
IM
3144
3145#else /* CONFIG_SMP */
3146
1da177e4
LT
3147/*
3148 * on UP we do not need to balance between CPUs:
3149 */
70b97a7f 3150static inline void idle_balance(int cpu, struct rq *rq)
1da177e4
LT
3151{
3152}
dd41f596
IM
3153
3154/* Avoid "used but not defined" warning on UP */
3155static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3156 unsigned long max_nr_move, unsigned long max_load_move,
3157 struct sched_domain *sd, enum cpu_idle_type idle,
3158 int *all_pinned, unsigned long *load_moved,
3159 int this_best_prio, int best_prio, int best_prio_seen,
3160 struct rq_iterator *iterator)
3161{
3162 *load_moved = 0;
3163
3164 return 0;
3165}
3166
1da177e4
LT
3167#endif
3168
1da177e4
LT
3169DEFINE_PER_CPU(struct kernel_stat, kstat);
3170
3171EXPORT_PER_CPU_SYMBOL(kstat);
3172
3173/*
41b86e9c
IM
3174 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3175 * that have not yet been banked in case the task is currently running.
1da177e4 3176 */
41b86e9c 3177unsigned long long task_sched_runtime(struct task_struct *p)
1da177e4 3178{
1da177e4 3179 unsigned long flags;
41b86e9c
IM
3180 u64 ns, delta_exec;
3181 struct rq *rq;
48f24c4d 3182
41b86e9c
IM
3183 rq = task_rq_lock(p, &flags);
3184 ns = p->se.sum_exec_runtime;
3185 if (rq->curr == p) {
3186 delta_exec = rq_clock(rq) - p->se.exec_start;
3187 if ((s64)delta_exec > 0)
3188 ns += delta_exec;
3189 }
3190 task_rq_unlock(rq, &flags);
48f24c4d 3191
1da177e4
LT
3192 return ns;
3193}
3194
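/*
 * Illustration only, not part of sched.c: task_sched_runtime() is the kind
 * of interface a CPU-clock reader would use. It returns nanoseconds and
 * already folds in the slice the task has run since exec_start was last
 * updated, so a caller needs no extra "now" handling. The helper name below
 * is hypothetical.
 */
static u64 sketch_task_cpu_ns(struct task_struct *tsk)
{
	/* safe to call for a running task; the rq lock is taken internally */
	return task_sched_runtime(tsk);
}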
1da177e4
LT
3195/*
3196 * Account user cpu time to a process.
3197 * @p: the process that the cpu time gets accounted to
3198 * @hardirq_offset: the offset to subtract from hardirq_count()
3199 * @cputime: the cpu time spent in user space since the last update
3200 */
3201void account_user_time(struct task_struct *p, cputime_t cputime)
3202{
3203 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3204 cputime64_t tmp;
3205
3206 p->utime = cputime_add(p->utime, cputime);
3207
3208 /* Add user time to cpustat. */
3209 tmp = cputime_to_cputime64(cputime);
3210 if (TASK_NICE(p) > 0)
3211 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3212 else
3213 cpustat->user = cputime64_add(cpustat->user, tmp);
3214}
3215
3216/*
3217 * Account system cpu time to a process.
3218 * @p: the process that the cpu time gets accounted to
3219 * @hardirq_offset: the offset to subtract from hardirq_count()
3220 * @cputime: the cpu time spent in kernel space since the last update
3221 */
3222void account_system_time(struct task_struct *p, int hardirq_offset,
3223 cputime_t cputime)
3224{
3225 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
70b97a7f 3226 struct rq *rq = this_rq();
1da177e4
LT
3227 cputime64_t tmp;
3228
3229 p->stime = cputime_add(p->stime, cputime);
3230
3231 /* Add system time to cpustat. */
3232 tmp = cputime_to_cputime64(cputime);
3233 if (hardirq_count() - hardirq_offset)
3234 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3235 else if (softirq_count())
3236 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3237 else if (p != rq->idle)
3238 cpustat->system = cputime64_add(cpustat->system, tmp);
3239 else if (atomic_read(&rq->nr_iowait) > 0)
3240 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3241 else
3242 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3243 /* Account for system time used */
3244 acct_update_integrals(p);
1da177e4
LT
3245}
3246
3247/*
3248 * Account for involuntary wait time.
3249 * @p: the process from which the cpu time has been stolen
3250 * @steal: the cpu time spent in involuntary wait
3251 */
3252void account_steal_time(struct task_struct *p, cputime_t steal)
3253{
3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3255 cputime64_t tmp = cputime_to_cputime64(steal);
70b97a7f 3256 struct rq *rq = this_rq();
1da177e4
LT
3257
3258 if (p == rq->idle) {
3259 p->stime = cputime_add(p->stime, steal);
3260 if (atomic_read(&rq->nr_iowait) > 0)
3261 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3262 else
3263 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3264 } else
3265 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3266}
3267
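/*
 * Illustration only, not part of sched.c: the accounting helpers above are
 * normally driven from the per-tick code (see update_process_times()). A
 * hypothetical caller splits each tick into user or system time roughly
 * like this, charging one jiffy of cputime per tick:
 */
static void sketch_account_tick(struct task_struct *p, int user_tick)
{
	cputime_t one_jiffy = jiffies_to_cputime(1);

	if (user_tick)
		account_user_time(p, one_jiffy);
	else
		account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
}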
7835b98b
CL
3268/*
3269 * This function gets called by the timer code, with HZ frequency.
3270 * We call it with interrupts disabled.
3271 *
3272 * It also gets called by the fork code, when changing the parent's
3273 * timeslices.
3274 */
3275void scheduler_tick(void)
3276{
7835b98b
CL
3277 int cpu = smp_processor_id();
3278 struct rq *rq = cpu_rq(cpu);
dd41f596
IM
3279 struct task_struct *curr = rq->curr;
3280
3281 spin_lock(&rq->lock);
3282 if (curr != rq->idle) /* FIXME: needed? */
3283 curr->sched_class->task_tick(rq, curr);
3284 update_cpu_load(rq);
3285 spin_unlock(&rq->lock);
7835b98b 3286
e418e1c2 3287#ifdef CONFIG_SMP
dd41f596
IM
3288 rq->idle_at_tick = idle_cpu(cpu);
3289 trigger_load_balance(rq, cpu);
e418e1c2 3290#endif
1da177e4
LT
3291}
3292
1da177e4
LT
3293#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3294
3295void fastcall add_preempt_count(int val)
3296{
3297 /*
3298 * Underflow?
3299 */
9a11b49a
IM
3300 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3301 return;
1da177e4
LT
3302 preempt_count() += val;
3303 /*
3304 * Spinlock count overflowing soon?
3305 */
33859f7f
MOS
3306 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3307 PREEMPT_MASK - 10);
1da177e4
LT
3308}
3309EXPORT_SYMBOL(add_preempt_count);
3310
3311void fastcall sub_preempt_count(int val)
3312{
3313 /*
3314 * Underflow?
3315 */
9a11b49a
IM
3316 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3317 return;
1da177e4
LT
3318 /*
3319 * Is the spinlock portion underflowing?
3320 */
9a11b49a
IM
3321 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3322 !(preempt_count() & PREEMPT_MASK)))
3323 return;
3324
1da177e4
LT
3325 preempt_count() -= val;
3326}
3327EXPORT_SYMBOL(sub_preempt_count);
3328
3329#endif
3330
3331/*
dd41f596 3332 * Print scheduling while atomic bug:
1da177e4 3333 */
dd41f596 3334static noinline void __schedule_bug(struct task_struct *prev)
1da177e4 3335{
dd41f596
IM
3336 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3337 prev->comm, preempt_count(), prev->pid);
3338 debug_show_held_locks(prev);
3339 if (irqs_disabled())
3340 print_irqtrace_events(prev);
3341 dump_stack();
3342}
1da177e4 3343
dd41f596
IM
3344/*
3345 * Various schedule()-time debugging checks and statistics:
3346 */
3347static inline void schedule_debug(struct task_struct *prev)
3348{
1da177e4
LT
3349 /*
3350 * Test if we are atomic. Since do_exit() needs to call into
3351 * schedule() atomically, we ignore that path for now.
3352 * Otherwise, whine if we are scheduling when we should not be.
3353 */
dd41f596
IM
3354 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3355 __schedule_bug(prev);
3356
1da177e4
LT
3357 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3358
dd41f596
IM
3359 schedstat_inc(this_rq(), sched_cnt);
3360}
3361
3362/*
3363 * Pick up the highest-prio task:
3364 */
3365static inline struct task_struct *
3366pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3367{
3368 struct sched_class *class;
3369 struct task_struct *p;
1da177e4
LT
3370
3371 /*
dd41f596
IM
3372 * Optimization: we know that if all tasks are in
3373 * the fair class we can call that function directly:
1da177e4 3374 */
dd41f596
IM
3375 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3376 p = fair_sched_class.pick_next_task(rq, now);
3377 if (likely(p))
3378 return p;
1da177e4
LT
3379 }
3380
dd41f596
IM
3381 class = sched_class_highest;
3382 for ( ; ; ) {
3383 p = class->pick_next_task(rq, now);
3384 if (p)
3385 return p;
3386 /*
3387 * Will never be NULL as the idle class always
3388 * returns a non-NULL p:
3389 */
3390 class = class->next;
3391 }
3392}
1da177e4 3393
dd41f596
IM
3394/*
3395 * schedule() is the main scheduler function.
3396 */
3397asmlinkage void __sched schedule(void)
3398{
3399 struct task_struct *prev, *next;
3400 long *switch_count;
3401 struct rq *rq;
3402 u64 now;
3403 int cpu;
3404
3405need_resched:
3406 preempt_disable();
3407 cpu = smp_processor_id();
3408 rq = cpu_rq(cpu);
3409 rcu_qsctr_inc(cpu);
3410 prev = rq->curr;
3411 switch_count = &prev->nivcsw;
3412
3413 release_kernel_lock(prev);
3414need_resched_nonpreemptible:
3415
3416 schedule_debug(prev);
1da177e4
LT
3417
3418 spin_lock_irq(&rq->lock);
dd41f596 3419 clear_tsk_need_resched(prev);
1da177e4 3420
1da177e4 3421 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1da177e4 3422 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
dd41f596 3423 unlikely(signal_pending(prev)))) {
1da177e4 3424 prev->state = TASK_RUNNING;
dd41f596
IM
3425 } else {
3426 deactivate_task(rq, prev, 1);
1da177e4 3427 }
dd41f596 3428 switch_count = &prev->nvcsw;
1da177e4
LT
3429 }
3430
dd41f596 3431 if (unlikely(!rq->nr_running))
1da177e4 3432 idle_balance(cpu, rq);
1da177e4 3433
dd41f596
IM
3434 now = __rq_clock(rq);
3435 prev->sched_class->put_prev_task(rq, prev, now);
3436 next = pick_next_task(rq, prev, now);
1da177e4
LT
3437
3438 sched_info_switch(prev, next);
dd41f596 3439
1da177e4 3440 if (likely(prev != next)) {
1da177e4
LT
3441 rq->nr_switches++;
3442 rq->curr = next;
3443 ++*switch_count;
3444
dd41f596 3445 context_switch(rq, prev, next); /* unlocks the rq */
1da177e4
LT
3446 } else
3447 spin_unlock_irq(&rq->lock);
3448
dd41f596
IM
3449 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3450 cpu = smp_processor_id();
3451 rq = cpu_rq(cpu);
1da177e4 3452 goto need_resched_nonpreemptible;
dd41f596 3453 }
1da177e4
LT
3454 preempt_enable_no_resched();
3455 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3456 goto need_resched;
3457}
1da177e4
LT
3458EXPORT_SYMBOL(schedule);
3459
3460#ifdef CONFIG_PREEMPT
3461/*
2ed6e34f 3462 * this is the entry point to schedule() from in-kernel preemption
1da177e4
LT
3463 * off of preempt_enable. Kernel preemptions off return from interrupt
3464 * occur there and call schedule directly.
3465 */
3466asmlinkage void __sched preempt_schedule(void)
3467{
3468 struct thread_info *ti = current_thread_info();
3469#ifdef CONFIG_PREEMPT_BKL
3470 struct task_struct *task = current;
3471 int saved_lock_depth;
3472#endif
3473 /*
3474 * If there is a non-zero preempt_count or interrupts are disabled,
3475 * we do not want to preempt the current task. Just return.
3476 */
beed33a8 3477 if (likely(ti->preempt_count || irqs_disabled()))
1da177e4
LT
3478 return;
3479
3480need_resched:
3481 add_preempt_count(PREEMPT_ACTIVE);
3482 /*
3483 * We keep the big kernel semaphore locked, but we
3484 * clear ->lock_depth so that schedule() doesn't
3485 * auto-release the semaphore:
3486 */
3487#ifdef CONFIG_PREEMPT_BKL
3488 saved_lock_depth = task->lock_depth;
3489 task->lock_depth = -1;
3490#endif
3491 schedule();
3492#ifdef CONFIG_PREEMPT_BKL
3493 task->lock_depth = saved_lock_depth;
3494#endif
3495 sub_preempt_count(PREEMPT_ACTIVE);
3496
3497 /* we could miss a preemption opportunity between schedule and now */
3498 barrier();
3499 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3500 goto need_resched;
3501}
1da177e4
LT
3502EXPORT_SYMBOL(preempt_schedule);
3503
3504/*
2ed6e34f 3505 * this is the entry point to schedule() from kernel preemption
1da177e4
LT
3506 * off of irq context.
3507 * Note that this is called and returns with irqs disabled. This will
3508 * protect us against recursive calling from irq.
3509 */
3510asmlinkage void __sched preempt_schedule_irq(void)
3511{
3512 struct thread_info *ti = current_thread_info();
3513#ifdef CONFIG_PREEMPT_BKL
3514 struct task_struct *task = current;
3515 int saved_lock_depth;
3516#endif
2ed6e34f 3517 /* Catch callers which need to be fixed */
1da177e4
LT
3518 BUG_ON(ti->preempt_count || !irqs_disabled());
3519
3520need_resched:
3521 add_preempt_count(PREEMPT_ACTIVE);
3522 /*
3523 * We keep the big kernel semaphore locked, but we
3524 * clear ->lock_depth so that schedule() doesn't
3525 * auto-release the semaphore:
3526 */
3527#ifdef CONFIG_PREEMPT_BKL
3528 saved_lock_depth = task->lock_depth;
3529 task->lock_depth = -1;
3530#endif
3531 local_irq_enable();
3532 schedule();
3533 local_irq_disable();
3534#ifdef CONFIG_PREEMPT_BKL
3535 task->lock_depth = saved_lock_depth;
3536#endif
3537 sub_preempt_count(PREEMPT_ACTIVE);
3538
3539 /* we could miss a preemption opportunity between schedule and now */
3540 barrier();
3541 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3542 goto need_resched;
3543}
3544
3545#endif /* CONFIG_PREEMPT */
3546
95cdf3b7
IM
3547int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3548 void *key)
1da177e4 3549{
48f24c4d 3550 return try_to_wake_up(curr->private, mode, sync);
1da177e4 3551}
1da177e4
LT
3552EXPORT_SYMBOL(default_wake_function);
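/*
 * Illustrative sketch (editor's example, not part of the original file):
 * default_wake_function() is only the default callback a waitqueue entry
 * points at; subsystems such as epoll install their own function with
 * init_waitqueue_func_entry() and chain to it.  All 'demo_' names below
 * are invented.
 */
static wait_queue_t demo_wait_entry;

static int demo_wake_callback(wait_queue_t *curr, unsigned mode, int sync,
			      void *key)
{
	/* custom bookkeeping would go here, then wake the stored task */
	return default_wake_function(curr, mode, sync, key);
}

static void demo_register_callback(wait_queue_head_t *whead)
{
	init_waitqueue_func_entry(&demo_wait_entry, demo_wake_callback);
	demo_wait_entry.private = current;	/* task for try_to_wake_up() */
	add_wait_queue(whead, &demo_wait_entry);
}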
3553
3554/*
3555 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 3556 * wake everything up. If it's an exclusive wakeup (nr_exclusive is a small
 3557 * positive number) then we wake all the non-exclusive tasks and one exclusive task.
3558 *
3559 * There are circumstances in which we can try to wake a task which has already
3560 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3561 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3562 */
3563static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3564 int nr_exclusive, int sync, void *key)
3565{
3566 struct list_head *tmp, *next;
3567
3568 list_for_each_safe(tmp, next, &q->task_list) {
48f24c4d
IM
3569 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3570 unsigned flags = curr->flags;
3571
1da177e4 3572 if (curr->func(curr, mode, sync, key) &&
48f24c4d 3573 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1da177e4
LT
3574 break;
3575 }
3576}
3577
3578/**
3579 * __wake_up - wake up threads blocked on a waitqueue.
3580 * @q: the waitqueue
3581 * @mode: which threads
3582 * @nr_exclusive: how many wake-one or wake-many threads to wake up
67be2dd1 3583 * @key: is directly passed to the wakeup function
1da177e4
LT
3584 */
3585void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
95cdf3b7 3586 int nr_exclusive, void *key)
1da177e4
LT
3587{
3588 unsigned long flags;
3589
3590 spin_lock_irqsave(&q->lock, flags);
3591 __wake_up_common(q, mode, nr_exclusive, 0, key);
3592 spin_unlock_irqrestore(&q->lock, flags);
3593}
1da177e4
LT
3594EXPORT_SYMBOL(__wake_up);
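/*
 * Illustrative sketch (editor's example): how the WQ_FLAG_EXCLUSIVE check in
 * __wake_up_common() is normally exercised.  prepare_to_wait_exclusive()
 * queues the entry at the tail with the exclusive flag set, so a plain
 * wake_up() (nr_exclusive == 1) wakes every non-exclusive waiter but at most
 * one exclusive waiter.  All 'demo_' names are invented.
 */
static DECLARE_WAIT_QUEUE_HEAD(demo_excl_wq);
static int demo_work_available;

static void demo_exclusive_consumer(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&demo_excl_wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		if (demo_work_available)
			break;
		schedule();
	}
	finish_wait(&demo_excl_wq, &wait);
	/* ... consume the work ... */
}

static void demo_producer(void)
{
	demo_work_available = 1;
	wake_up(&demo_excl_wq);		/* wakes at most one exclusive consumer */
}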
3595
3596/*
3597 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3598 */
3599void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3600{
3601 __wake_up_common(q, mode, 1, 0, NULL);
3602}
3603
3604/**
67be2dd1 3605 * __wake_up_sync - wake up threads blocked on a waitqueue.
1da177e4
LT
3606 * @q: the waitqueue
3607 * @mode: which threads
3608 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3609 *
 3610 * The sync wakeup differs in that the waker knows that it will schedule
3611 * away soon, so while the target thread will be woken up, it will not
3612 * be migrated to another CPU - ie. the two threads are 'synchronized'
3613 * with each other. This can prevent needless bouncing between CPUs.
3614 *
3615 * On UP it can prevent extra preemption.
3616 */
95cdf3b7
IM
3617void fastcall
3618__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1da177e4
LT
3619{
3620 unsigned long flags;
3621 int sync = 1;
3622
3623 if (unlikely(!q))
3624 return;
3625
3626 if (unlikely(!nr_exclusive))
3627 sync = 0;
3628
3629 spin_lock_irqsave(&q->lock, flags);
3630 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3631 spin_unlock_irqrestore(&q->lock, flags);
3632}
3633EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3634
3635void fastcall complete(struct completion *x)
3636{
3637 unsigned long flags;
3638
3639 spin_lock_irqsave(&x->wait.lock, flags);
3640 x->done++;
3641 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3642 1, 0, NULL);
3643 spin_unlock_irqrestore(&x->wait.lock, flags);
3644}
3645EXPORT_SYMBOL(complete);
3646
3647void fastcall complete_all(struct completion *x)
3648{
3649 unsigned long flags;
3650
3651 spin_lock_irqsave(&x->wait.lock, flags);
3652 x->done += UINT_MAX/2;
3653 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3654 0, 0, NULL);
3655 spin_unlock_irqrestore(&x->wait.lock, flags);
3656}
3657EXPORT_SYMBOL(complete_all);
3658
3659void fastcall __sched wait_for_completion(struct completion *x)
3660{
3661 might_sleep();
48f24c4d 3662
1da177e4
LT
3663 spin_lock_irq(&x->wait.lock);
3664 if (!x->done) {
3665 DECLARE_WAITQUEUE(wait, current);
3666
3667 wait.flags |= WQ_FLAG_EXCLUSIVE;
3668 __add_wait_queue_tail(&x->wait, &wait);
3669 do {
3670 __set_current_state(TASK_UNINTERRUPTIBLE);
3671 spin_unlock_irq(&x->wait.lock);
3672 schedule();
3673 spin_lock_irq(&x->wait.lock);
3674 } while (!x->done);
3675 __remove_wait_queue(&x->wait, &wait);
3676 }
3677 x->done--;
3678 spin_unlock_irq(&x->wait.lock);
3679}
3680EXPORT_SYMBOL(wait_for_completion);
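/*
 * Illustrative sketch (editor's example): the typical completion pattern -
 * one side announces an event, the other blocks until it has happened.
 * The 'demo_' names are invented.
 */
static DECLARE_COMPLETION(demo_setup_done);

static int demo_setup_worker(void *unused)
{
	/* ... perform the setup the other side is waiting for ... */
	complete(&demo_setup_done);	/* bumps ->done and wakes one waiter */
	return 0;
}

static void demo_wait_for_setup(void)
{
	/* sleeps in TASK_UNINTERRUPTIBLE until demo_setup_worker() is done */
	wait_for_completion(&demo_setup_done);
}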
3681
3682unsigned long fastcall __sched
3683wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3684{
3685 might_sleep();
3686
3687 spin_lock_irq(&x->wait.lock);
3688 if (!x->done) {
3689 DECLARE_WAITQUEUE(wait, current);
3690
3691 wait.flags |= WQ_FLAG_EXCLUSIVE;
3692 __add_wait_queue_tail(&x->wait, &wait);
3693 do {
3694 __set_current_state(TASK_UNINTERRUPTIBLE);
3695 spin_unlock_irq(&x->wait.lock);
3696 timeout = schedule_timeout(timeout);
3697 spin_lock_irq(&x->wait.lock);
3698 if (!timeout) {
3699 __remove_wait_queue(&x->wait, &wait);
3700 goto out;
3701 }
3702 } while (!x->done);
3703 __remove_wait_queue(&x->wait, &wait);
3704 }
3705 x->done--;
3706out:
3707 spin_unlock_irq(&x->wait.lock);
3708 return timeout;
3709}
3710EXPORT_SYMBOL(wait_for_completion_timeout);
3711
3712int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3713{
3714 int ret = 0;
3715
3716 might_sleep();
3717
3718 spin_lock_irq(&x->wait.lock);
3719 if (!x->done) {
3720 DECLARE_WAITQUEUE(wait, current);
3721
3722 wait.flags |= WQ_FLAG_EXCLUSIVE;
3723 __add_wait_queue_tail(&x->wait, &wait);
3724 do {
3725 if (signal_pending(current)) {
3726 ret = -ERESTARTSYS;
3727 __remove_wait_queue(&x->wait, &wait);
3728 goto out;
3729 }
3730 __set_current_state(TASK_INTERRUPTIBLE);
3731 spin_unlock_irq(&x->wait.lock);
3732 schedule();
3733 spin_lock_irq(&x->wait.lock);
3734 } while (!x->done);
3735 __remove_wait_queue(&x->wait, &wait);
3736 }
3737 x->done--;
3738out:
3739 spin_unlock_irq(&x->wait.lock);
3740
3741 return ret;
3742}
3743EXPORT_SYMBOL(wait_for_completion_interruptible);
3744
3745unsigned long fastcall __sched
3746wait_for_completion_interruptible_timeout(struct completion *x,
3747 unsigned long timeout)
3748{
3749 might_sleep();
3750
3751 spin_lock_irq(&x->wait.lock);
3752 if (!x->done) {
3753 DECLARE_WAITQUEUE(wait, current);
3754
3755 wait.flags |= WQ_FLAG_EXCLUSIVE;
3756 __add_wait_queue_tail(&x->wait, &wait);
3757 do {
3758 if (signal_pending(current)) {
3759 timeout = -ERESTARTSYS;
3760 __remove_wait_queue(&x->wait, &wait);
3761 goto out;
3762 }
3763 __set_current_state(TASK_INTERRUPTIBLE);
3764 spin_unlock_irq(&x->wait.lock);
3765 timeout = schedule_timeout(timeout);
3766 spin_lock_irq(&x->wait.lock);
3767 if (!timeout) {
3768 __remove_wait_queue(&x->wait, &wait);
3769 goto out;
3770 }
3771 } while (!x->done);
3772 __remove_wait_queue(&x->wait, &wait);
3773 }
3774 x->done--;
3775out:
3776 spin_unlock_irq(&x->wait.lock);
3777 return timeout;
3778}
3779EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3780
3781
3782#define SLEEP_ON_VAR \
3783 unsigned long flags; \
3784 wait_queue_t wait; \
3785 init_waitqueue_entry(&wait, current);
3786
3787#define SLEEP_ON_HEAD \
 3788 spin_lock_irqsave(&q->lock, flags); \
3789 __add_wait_queue(q, &wait); \
3790 spin_unlock(&q->lock);
3791
3792#define SLEEP_ON_TAIL \
3793 spin_lock_irq(&q->lock); \
3794 __remove_wait_queue(q, &wait); \
3795 spin_unlock_irqrestore(&q->lock, flags);
3796
3797void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3798{
3799 SLEEP_ON_VAR
3800
3801 current->state = TASK_INTERRUPTIBLE;
3802
3803 SLEEP_ON_HEAD
3804 schedule();
3805 SLEEP_ON_TAIL
3806}
1da177e4
LT
3807EXPORT_SYMBOL(interruptible_sleep_on);
3808
95cdf3b7
IM
3809long fastcall __sched
3810interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1da177e4
LT
3811{
3812 SLEEP_ON_VAR
3813
3814 current->state = TASK_INTERRUPTIBLE;
3815
3816 SLEEP_ON_HEAD
3817 timeout = schedule_timeout(timeout);
3818 SLEEP_ON_TAIL
3819
3820 return timeout;
3821}
1da177e4
LT
3822EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3823
3824void fastcall __sched sleep_on(wait_queue_head_t *q)
3825{
3826 SLEEP_ON_VAR
3827
3828 current->state = TASK_UNINTERRUPTIBLE;
3829
3830 SLEEP_ON_HEAD
3831 schedule();
3832 SLEEP_ON_TAIL
3833}
1da177e4
LT
3834EXPORT_SYMBOL(sleep_on);
3835
3836long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3837{
3838 SLEEP_ON_VAR
3839
3840 current->state = TASK_UNINTERRUPTIBLE;
3841
3842 SLEEP_ON_HEAD
3843 timeout = schedule_timeout(timeout);
3844 SLEEP_ON_TAIL
3845
3846 return timeout;
3847}
3848
3849EXPORT_SYMBOL(sleep_on_timeout);
3850
b29739f9
IM
3851#ifdef CONFIG_RT_MUTEXES
3852
3853/*
3854 * rt_mutex_setprio - set the current priority of a task
3855 * @p: task
3856 * @prio: prio value (kernel-internal form)
3857 *
3858 * This function changes the 'effective' priority of a task. It does
3859 * not touch ->normal_prio like __setscheduler().
3860 *
3861 * Used by the rt_mutex code to implement priority inheritance logic.
3862 */
36c8b586 3863void rt_mutex_setprio(struct task_struct *p, int prio)
b29739f9
IM
3864{
3865 unsigned long flags;
dd41f596 3866 int oldprio, on_rq;
70b97a7f 3867 struct rq *rq;
dd41f596 3868 u64 now;
b29739f9
IM
3869
3870 BUG_ON(prio < 0 || prio > MAX_PRIO);
3871
3872 rq = task_rq_lock(p, &flags);
dd41f596 3873 now = rq_clock(rq);
b29739f9 3874
d5f9f942 3875 oldprio = p->prio;
dd41f596
IM
3876 on_rq = p->se.on_rq;
3877 if (on_rq)
3878 dequeue_task(rq, p, 0, now);
3879
3880 if (rt_prio(prio))
3881 p->sched_class = &rt_sched_class;
3882 else
3883 p->sched_class = &fair_sched_class;
3884
b29739f9
IM
3885 p->prio = prio;
3886
dd41f596
IM
3887 if (on_rq) {
3888 enqueue_task(rq, p, 0, now);
b29739f9
IM
3889 /*
3890 * Reschedule if we are currently running on this runqueue and
d5f9f942
AM
3891 * our priority decreased, or if we are not currently running on
3892 * this runqueue and our priority is higher than the current's
b29739f9 3893 */
d5f9f942
AM
3894 if (task_running(rq, p)) {
3895 if (p->prio > oldprio)
3896 resched_task(rq->curr);
dd41f596
IM
3897 } else {
3898 check_preempt_curr(rq, p);
3899 }
b29739f9
IM
3900 }
3901 task_rq_unlock(rq, &flags);
3902}
3903
3904#endif
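/*
 * Illustrative sketch (editor's example): rt_mutex_setprio() above is the
 * scheduler half of priority inheritance; an rtmutex user never calls it
 * directly.  Boosting and deboosting happen inside lock/unlock.  The sketch
 * assumes CONFIG_RT_MUTEXES=y and that <linux/rtmutex.h> is available;
 * 'demo_pi_lock' is an invented name.
 */
static DEFINE_RT_MUTEX(demo_pi_lock);

static void demo_pi_critical_section(void)
{
	rt_mutex_lock(&demo_pi_lock);
	/*
	 * If a higher-priority task blocks on demo_pi_lock here, the rtmutex
	 * code boosts us via rt_mutex_setprio(); the boost is undone (again
	 * via rt_mutex_setprio()) when the lock is released below.
	 */
	rt_mutex_unlock(&demo_pi_lock);
}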
3905
36c8b586 3906void set_user_nice(struct task_struct *p, long nice)
1da177e4 3907{
dd41f596 3908 int old_prio, delta, on_rq;
1da177e4 3909 unsigned long flags;
70b97a7f 3910 struct rq *rq;
dd41f596 3911 u64 now;
1da177e4
LT
3912
3913 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3914 return;
3915 /*
3916 * We have to be careful, if called from sys_setpriority(),
3917 * the task might be in the middle of scheduling on another CPU.
3918 */
3919 rq = task_rq_lock(p, &flags);
dd41f596 3920 now = rq_clock(rq);
1da177e4
LT
3921 /*
3922 * The RT priorities are set via sched_setscheduler(), but we still
3923 * allow the 'normal' nice value to be set - but as expected
 3924 * it won't have any effect on scheduling as long as the task is
dd41f596 3925 * SCHED_FIFO/SCHED_RR:
1da177e4 3926 */
e05606d3 3927 if (task_has_rt_policy(p)) {
1da177e4
LT
3928 p->static_prio = NICE_TO_PRIO(nice);
3929 goto out_unlock;
3930 }
dd41f596
IM
3931 on_rq = p->se.on_rq;
3932 if (on_rq) {
3933 dequeue_task(rq, p, 0, now);
3934 dec_load(rq, p, now);
2dd73a4f 3935 }
1da177e4 3936
1da177e4 3937 p->static_prio = NICE_TO_PRIO(nice);
2dd73a4f 3938 set_load_weight(p);
b29739f9
IM
3939 old_prio = p->prio;
3940 p->prio = effective_prio(p);
3941 delta = p->prio - old_prio;
1da177e4 3942
dd41f596
IM
3943 if (on_rq) {
3944 enqueue_task(rq, p, 0, now);
3945 inc_load(rq, p, now);
1da177e4 3946 /*
d5f9f942
AM
3947 * If the task increased its priority or is running and
3948 * lowered its priority, then reschedule its CPU:
1da177e4 3949 */
d5f9f942 3950 if (delta < 0 || (delta > 0 && task_running(rq, p)))
1da177e4
LT
3951 resched_task(rq->curr);
3952 }
3953out_unlock:
3954 task_rq_unlock(rq, &flags);
3955}
1da177e4
LT
3956EXPORT_SYMBOL(set_user_nice);
3957
e43379f1
MM
3958/*
3959 * can_nice - check if a task can reduce its nice value
3960 * @p: task
3961 * @nice: nice value
3962 */
36c8b586 3963int can_nice(const struct task_struct *p, const int nice)
e43379f1 3964{
024f4747
MM
3965 /* convert nice value [19,-20] to rlimit style value [1,40] */
3966 int nice_rlim = 20 - nice;
48f24c4d 3967
e43379f1
MM
3968 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3969 capable(CAP_SYS_NICE));
3970}
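/*
 * Worked example (editor's note): for nice == -5 the rlimit-style value is
 * 20 - (-5) == 25, so an unprivileged task may lower its nice value to -5
 * only if RLIMIT_NICE's current limit is at least 25 (or it has
 * CAP_SYS_NICE).
 */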
3971
1da177e4
LT
3972#ifdef __ARCH_WANT_SYS_NICE
3973
3974/*
3975 * sys_nice - change the priority of the current process.
3976 * @increment: priority increment
3977 *
3978 * sys_setpriority is a more generic, but much slower function that
3979 * does similar things.
3980 */
3981asmlinkage long sys_nice(int increment)
3982{
48f24c4d 3983 long nice, retval;
1da177e4
LT
3984
3985 /*
3986 * Setpriority might change our priority at the same moment.
3987 * We don't have to worry. Conceptually one call occurs first
3988 * and we have a single winner.
3989 */
e43379f1
MM
3990 if (increment < -40)
3991 increment = -40;
1da177e4
LT
3992 if (increment > 40)
3993 increment = 40;
3994
3995 nice = PRIO_TO_NICE(current->static_prio) + increment;
3996 if (nice < -20)
3997 nice = -20;
3998 if (nice > 19)
3999 nice = 19;
4000
e43379f1
MM
4001 if (increment < 0 && !can_nice(current, nice))
4002 return -EPERM;
4003
1da177e4
LT
4004 retval = security_task_setnice(current, nice);
4005 if (retval)
4006 return retval;
4007
4008 set_user_nice(current, nice);
4009 return 0;
4010}
4011
4012#endif
4013
4014/**
4015 * task_prio - return the priority value of a given task.
4016 * @p: the task in question.
4017 *
4018 * This is the priority value as seen by users in /proc.
 4019 * RT tasks show up as negative values. Normal tasks map their nice
 4020 * value onto the 0..39 range (nice 0 appears as 20).
4021 */
36c8b586 4022int task_prio(const struct task_struct *p)
1da177e4
LT
4023{
4024 return p->prio - MAX_RT_PRIO;
4025}
4026
4027/**
4028 * task_nice - return the nice value of a given task.
4029 * @p: the task in question.
4030 */
36c8b586 4031int task_nice(const struct task_struct *p)
1da177e4
LT
4032{
4033 return TASK_NICE(p);
4034}
1da177e4 4035EXPORT_SYMBOL_GPL(task_nice);
1da177e4
LT
4036
4037/**
4038 * idle_cpu - is a given cpu idle currently?
4039 * @cpu: the processor in question.
4040 */
4041int idle_cpu(int cpu)
4042{
4043 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4044}
4045
1da177e4
LT
4046/**
4047 * idle_task - return the idle task for a given cpu.
4048 * @cpu: the processor in question.
4049 */
36c8b586 4050struct task_struct *idle_task(int cpu)
1da177e4
LT
4051{
4052 return cpu_rq(cpu)->idle;
4053}
4054
4055/**
4056 * find_process_by_pid - find a process with a matching PID value.
4057 * @pid: the pid in question.
4058 */
36c8b586 4059static inline struct task_struct *find_process_by_pid(pid_t pid)
1da177e4
LT
4060{
4061 return pid ? find_task_by_pid(pid) : current;
4062}
4063
4064/* Actually do priority change: must hold rq lock. */
dd41f596
IM
4065static void
4066__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
1da177e4 4067{
dd41f596 4068 BUG_ON(p->se.on_rq);
48f24c4d 4069
1da177e4 4070 p->policy = policy;
dd41f596
IM
4071 switch (p->policy) {
4072 case SCHED_NORMAL:
4073 case SCHED_BATCH:
4074 case SCHED_IDLE:
4075 p->sched_class = &fair_sched_class;
4076 break;
4077 case SCHED_FIFO:
4078 case SCHED_RR:
4079 p->sched_class = &rt_sched_class;
4080 break;
4081 }
4082
1da177e4 4083 p->rt_priority = prio;
b29739f9
IM
4084 p->normal_prio = normal_prio(p);
4085 /* we are holding p->pi_lock already */
4086 p->prio = rt_mutex_getprio(p);
2dd73a4f 4087 set_load_weight(p);
1da177e4
LT
4088}
4089
4090/**
72fd4a35 4091 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
1da177e4
LT
4092 * @p: the task in question.
4093 * @policy: new policy.
4094 * @param: structure containing the new RT priority.
5fe1d75f 4095 *
72fd4a35 4096 * NOTE that the task may be already dead.
1da177e4 4097 */
95cdf3b7
IM
4098int sched_setscheduler(struct task_struct *p, int policy,
4099 struct sched_param *param)
1da177e4 4100{
dd41f596 4101 int retval, oldprio, oldpolicy = -1, on_rq;
1da177e4 4102 unsigned long flags;
70b97a7f 4103 struct rq *rq;
1da177e4 4104
66e5393a
SR
4105 /* may grab non-irq protected spin_locks */
4106 BUG_ON(in_interrupt());
1da177e4
LT
4107recheck:
4108 /* double check policy once rq lock held */
4109 if (policy < 0)
4110 policy = oldpolicy = p->policy;
4111 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
dd41f596
IM
4112 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4113 policy != SCHED_IDLE)
b0a9499c 4114 return -EINVAL;
1da177e4
LT
4115 /*
4116 * Valid priorities for SCHED_FIFO and SCHED_RR are
dd41f596
IM
4117 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4118 * SCHED_BATCH and SCHED_IDLE is 0.
1da177e4
LT
4119 */
4120 if (param->sched_priority < 0 ||
95cdf3b7 4121 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
d46523ea 4122 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
1da177e4 4123 return -EINVAL;
e05606d3 4124 if (rt_policy(policy) != (param->sched_priority != 0))
1da177e4
LT
4125 return -EINVAL;
4126
37e4ab3f
OC
4127 /*
4128 * Allow unprivileged RT tasks to decrease priority:
4129 */
4130 if (!capable(CAP_SYS_NICE)) {
e05606d3 4131 if (rt_policy(policy)) {
8dc3e909 4132 unsigned long rlim_rtprio;
8dc3e909
ON
4133
4134 if (!lock_task_sighand(p, &flags))
4135 return -ESRCH;
4136 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4137 unlock_task_sighand(p, &flags);
4138
4139 /* can't set/change the rt policy */
4140 if (policy != p->policy && !rlim_rtprio)
4141 return -EPERM;
4142
4143 /* can't increase priority */
4144 if (param->sched_priority > p->rt_priority &&
4145 param->sched_priority > rlim_rtprio)
4146 return -EPERM;
4147 }
dd41f596
IM
4148 /*
 4149 * Like positive nice levels, don't allow tasks to
4150 * move out of SCHED_IDLE either:
4151 */
4152 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4153 return -EPERM;
5fe1d75f 4154
37e4ab3f
OC
4155 /* can't change other user's priorities */
4156 if ((current->euid != p->euid) &&
4157 (current->euid != p->uid))
4158 return -EPERM;
4159 }
1da177e4
LT
4160
4161 retval = security_task_setscheduler(p, policy, param);
4162 if (retval)
4163 return retval;
b29739f9
IM
4164 /*
4165 * make sure no PI-waiters arrive (or leave) while we are
4166 * changing the priority of the task:
4167 */
4168 spin_lock_irqsave(&p->pi_lock, flags);
1da177e4
LT
4169 /*
 4170 * To be able to change p->policy safely, the appropriate
4171 * runqueue lock must be held.
4172 */
b29739f9 4173 rq = __task_rq_lock(p);
1da177e4
LT
4174 /* recheck policy now with rq lock held */
4175 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4176 policy = oldpolicy = -1;
b29739f9
IM
4177 __task_rq_unlock(rq);
4178 spin_unlock_irqrestore(&p->pi_lock, flags);
1da177e4
LT
4179 goto recheck;
4180 }
dd41f596
IM
4181 on_rq = p->se.on_rq;
4182 if (on_rq)
4183 deactivate_task(rq, p, 0);
1da177e4 4184 oldprio = p->prio;
dd41f596
IM
4185 __setscheduler(rq, p, policy, param->sched_priority);
4186 if (on_rq) {
4187 activate_task(rq, p, 0);
1da177e4
LT
4188 /*
4189 * Reschedule if we are currently running on this runqueue and
d5f9f942
AM
4190 * our priority decreased, or if we are not currently running on
4191 * this runqueue and our priority is higher than the current's
1da177e4 4192 */
d5f9f942
AM
4193 if (task_running(rq, p)) {
4194 if (p->prio > oldprio)
4195 resched_task(rq->curr);
dd41f596
IM
4196 } else {
4197 check_preempt_curr(rq, p);
4198 }
1da177e4 4199 }
b29739f9
IM
4200 __task_rq_unlock(rq);
4201 spin_unlock_irqrestore(&p->pi_lock, flags);
4202
95e02ca9
TG
4203 rt_mutex_adjust_pi(p);
4204
1da177e4
LT
4205 return 0;
4206}
4207EXPORT_SYMBOL_GPL(sched_setscheduler);
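/*
 * Illustrative sketch (editor's example): the common in-kernel use of
 * sched_setscheduler() is to give a kernel thread realtime priority, much
 * like the migration code further down does.  'demo_make_kthread_rt' is an
 * invented helper; 'p' must be a valid task reference and the call may sleep.
 */
static int demo_make_kthread_rt(struct task_struct *p)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	return sched_setscheduler(p, SCHED_FIFO, &param);
}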
4208
95cdf3b7
IM
4209static int
4210do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
1da177e4 4211{
1da177e4
LT
4212 struct sched_param lparam;
4213 struct task_struct *p;
36c8b586 4214 int retval;
1da177e4
LT
4215
4216 if (!param || pid < 0)
4217 return -EINVAL;
4218 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4219 return -EFAULT;
5fe1d75f
ON
4220
4221 rcu_read_lock();
4222 retval = -ESRCH;
1da177e4 4223 p = find_process_by_pid(pid);
5fe1d75f
ON
4224 if (p != NULL)
4225 retval = sched_setscheduler(p, policy, &lparam);
4226 rcu_read_unlock();
36c8b586 4227
1da177e4
LT
4228 return retval;
4229}
4230
4231/**
4232 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4233 * @pid: the pid in question.
4234 * @policy: new policy.
4235 * @param: structure containing the new RT priority.
4236 */
4237asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4238 struct sched_param __user *param)
4239{
c21761f1
JB
4240 /* negative values for policy are not valid */
4241 if (policy < 0)
4242 return -EINVAL;
4243
1da177e4
LT
4244 return do_sched_setscheduler(pid, policy, param);
4245}
4246
4247/**
4248 * sys_sched_setparam - set/change the RT priority of a thread
4249 * @pid: the pid in question.
4250 * @param: structure containing the new RT priority.
4251 */
4252asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4253{
4254 return do_sched_setscheduler(pid, -1, param);
4255}
4256
4257/**
4258 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4259 * @pid: the pid in question.
4260 */
4261asmlinkage long sys_sched_getscheduler(pid_t pid)
4262{
36c8b586 4263 struct task_struct *p;
1da177e4 4264 int retval = -EINVAL;
1da177e4
LT
4265
4266 if (pid < 0)
4267 goto out_nounlock;
4268
4269 retval = -ESRCH;
4270 read_lock(&tasklist_lock);
4271 p = find_process_by_pid(pid);
4272 if (p) {
4273 retval = security_task_getscheduler(p);
4274 if (!retval)
4275 retval = p->policy;
4276 }
4277 read_unlock(&tasklist_lock);
4278
4279out_nounlock:
4280 return retval;
4281}
4282
4283/**
 4284 * sys_sched_getparam - get the RT priority of a thread
4285 * @pid: the pid in question.
4286 * @param: structure containing the RT priority.
4287 */
4288asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4289{
4290 struct sched_param lp;
36c8b586 4291 struct task_struct *p;
1da177e4 4292 int retval = -EINVAL;
1da177e4
LT
4293
4294 if (!param || pid < 0)
4295 goto out_nounlock;
4296
4297 read_lock(&tasklist_lock);
4298 p = find_process_by_pid(pid);
4299 retval = -ESRCH;
4300 if (!p)
4301 goto out_unlock;
4302
4303 retval = security_task_getscheduler(p);
4304 if (retval)
4305 goto out_unlock;
4306
4307 lp.sched_priority = p->rt_priority;
4308 read_unlock(&tasklist_lock);
4309
4310 /*
4311 * This one might sleep, we cannot do it with a spinlock held ...
4312 */
4313 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4314
4315out_nounlock:
4316 return retval;
4317
4318out_unlock:
4319 read_unlock(&tasklist_lock);
4320 return retval;
4321}
4322
4323long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4324{
1da177e4 4325 cpumask_t cpus_allowed;
36c8b586
IM
4326 struct task_struct *p;
4327 int retval;
1da177e4 4328
5be9361c 4329 mutex_lock(&sched_hotcpu_mutex);
1da177e4
LT
4330 read_lock(&tasklist_lock);
4331
4332 p = find_process_by_pid(pid);
4333 if (!p) {
4334 read_unlock(&tasklist_lock);
5be9361c 4335 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4336 return -ESRCH;
4337 }
4338
4339 /*
4340 * It is not safe to call set_cpus_allowed with the
4341 * tasklist_lock held. We will bump the task_struct's
4342 * usage count and then drop tasklist_lock.
4343 */
4344 get_task_struct(p);
4345 read_unlock(&tasklist_lock);
4346
4347 retval = -EPERM;
4348 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4349 !capable(CAP_SYS_NICE))
4350 goto out_unlock;
4351
e7834f8f
DQ
4352 retval = security_task_setscheduler(p, 0, NULL);
4353 if (retval)
4354 goto out_unlock;
4355
1da177e4
LT
4356 cpus_allowed = cpuset_cpus_allowed(p);
4357 cpus_and(new_mask, new_mask, cpus_allowed);
4358 retval = set_cpus_allowed(p, new_mask);
4359
4360out_unlock:
4361 put_task_struct(p);
5be9361c 4362 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4363 return retval;
4364}
4365
4366static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4367 cpumask_t *new_mask)
4368{
4369 if (len < sizeof(cpumask_t)) {
4370 memset(new_mask, 0, sizeof(cpumask_t));
4371 } else if (len > sizeof(cpumask_t)) {
4372 len = sizeof(cpumask_t);
4373 }
4374 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4375}
4376
4377/**
4378 * sys_sched_setaffinity - set the cpu affinity of a process
4379 * @pid: pid of the process
4380 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4381 * @user_mask_ptr: user-space pointer to the new cpu mask
4382 */
4383asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4384 unsigned long __user *user_mask_ptr)
4385{
4386 cpumask_t new_mask;
4387 int retval;
4388
4389 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4390 if (retval)
4391 return retval;
4392
4393 return sched_setaffinity(pid, new_mask);
4394}
4395
4396/*
 4397 * Represents all CPUs present in the system.
 4398 * In systems capable of hotplug, this map could dynamically grow
 4399 * as new CPUs are detected in the system via any platform-specific
 4400 * method, e.g. ACPI.
4401 */
4402
4cef0c61 4403cpumask_t cpu_present_map __read_mostly;
1da177e4
LT
4404EXPORT_SYMBOL(cpu_present_map);
4405
4406#ifndef CONFIG_SMP
4cef0c61 4407cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
e16b38f7
GB
4408EXPORT_SYMBOL(cpu_online_map);
4409
4cef0c61 4410cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
e16b38f7 4411EXPORT_SYMBOL(cpu_possible_map);
1da177e4
LT
4412#endif
4413
4414long sched_getaffinity(pid_t pid, cpumask_t *mask)
4415{
36c8b586 4416 struct task_struct *p;
1da177e4 4417 int retval;
1da177e4 4418
5be9361c 4419 mutex_lock(&sched_hotcpu_mutex);
1da177e4
LT
4420 read_lock(&tasklist_lock);
4421
4422 retval = -ESRCH;
4423 p = find_process_by_pid(pid);
4424 if (!p)
4425 goto out_unlock;
4426
e7834f8f
DQ
4427 retval = security_task_getscheduler(p);
4428 if (retval)
4429 goto out_unlock;
4430
2f7016d9 4431 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
1da177e4
LT
4432
4433out_unlock:
4434 read_unlock(&tasklist_lock);
5be9361c 4435 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4436 if (retval)
4437 return retval;
4438
4439 return 0;
4440}
4441
4442/**
4443 * sys_sched_getaffinity - get the cpu affinity of a process
4444 * @pid: pid of the process
4445 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4446 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4447 */
4448asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4449 unsigned long __user *user_mask_ptr)
4450{
4451 int ret;
4452 cpumask_t mask;
4453
4454 if (len < sizeof(cpumask_t))
4455 return -EINVAL;
4456
4457 ret = sched_getaffinity(pid, &mask);
4458 if (ret < 0)
4459 return ret;
4460
4461 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4462 return -EFAULT;
4463
4464 return sizeof(cpumask_t);
4465}
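/*
 * Illustrative sketch (editor's example): the userspace view of the two
 * affinity syscalls above, driven through the glibc wrappers.  Shown inside
 * "#if 0" because it is not kernel code; error handling is minimal.
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);				/* run only on CPU 0 */
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");

	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("bound to CPU 0: %d\n", CPU_ISSET(0, &set) != 0);
	return 0;
}
#endif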
4466
4467/**
4468 * sys_sched_yield - yield the current processor to other threads.
4469 *
dd41f596
IM
4470 * This function yields the current CPU to other tasks. If there are no
4471 * other threads running on this CPU then this function will return.
1da177e4
LT
4472 */
4473asmlinkage long sys_sched_yield(void)
4474{
70b97a7f 4475 struct rq *rq = this_rq_lock();
1da177e4
LT
4476
4477 schedstat_inc(rq, yld_cnt);
dd41f596 4478 if (unlikely(rq->nr_running == 1))
1da177e4 4479 schedstat_inc(rq, yld_act_empty);
dd41f596
IM
4480 else
4481 current->sched_class->yield_task(rq, current);
1da177e4
LT
4482
4483 /*
4484 * Since we are going to call schedule() anyway, there's
4485 * no need to preempt or enable interrupts:
4486 */
4487 __release(rq->lock);
8a25d5de 4488 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1da177e4
LT
4489 _raw_spin_unlock(&rq->lock);
4490 preempt_enable_no_resched();
4491
4492 schedule();
4493
4494 return 0;
4495}
4496
e7b38404 4497static void __cond_resched(void)
1da177e4 4498{
8e0a43d8
IM
4499#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4500 __might_sleep(__FILE__, __LINE__);
4501#endif
5bbcfd90
IM
4502 /*
4503 * The BKS might be reacquired before we have dropped
4504 * PREEMPT_ACTIVE, which could trigger a second
4505 * cond_resched() call.
4506 */
1da177e4
LT
4507 do {
4508 add_preempt_count(PREEMPT_ACTIVE);
4509 schedule();
4510 sub_preempt_count(PREEMPT_ACTIVE);
4511 } while (need_resched());
4512}
4513
4514int __sched cond_resched(void)
4515{
9414232f
IM
4516 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4517 system_state == SYSTEM_RUNNING) {
1da177e4
LT
4518 __cond_resched();
4519 return 1;
4520 }
4521 return 0;
4522}
1da177e4
LT
4523EXPORT_SYMBOL(cond_resched);
4524
4525/*
4526 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4527 * call schedule, and on return reacquire the lock.
4528 *
4529 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4530 * operations here to prevent schedule() from being called twice (once via
4531 * spin_unlock(), once by hand).
4532 */
95cdf3b7 4533int cond_resched_lock(spinlock_t *lock)
1da177e4 4534{
6df3cecb
JK
4535 int ret = 0;
4536
1da177e4
LT
4537 if (need_lockbreak(lock)) {
4538 spin_unlock(lock);
4539 cpu_relax();
6df3cecb 4540 ret = 1;
1da177e4
LT
4541 spin_lock(lock);
4542 }
9414232f 4543 if (need_resched() && system_state == SYSTEM_RUNNING) {
8a25d5de 4544 spin_release(&lock->dep_map, 1, _THIS_IP_);
1da177e4
LT
4545 _raw_spin_unlock(lock);
4546 preempt_enable_no_resched();
4547 __cond_resched();
6df3cecb 4548 ret = 1;
1da177e4 4549 spin_lock(lock);
1da177e4 4550 }
6df3cecb 4551 return ret;
1da177e4 4552}
1da177e4
LT
4553EXPORT_SYMBOL(cond_resched_lock);
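/*
 * Illustrative sketch (editor's example): a long scan done under a spinlock
 * can use cond_resched_lock() instead of open-coding the unlock/schedule/
 * relock dance.  The lock, table and size are invented, and the table is
 * assumed to stay valid while the lock is temporarily dropped.
 */
static void demo_locked_scan(spinlock_t *lock, int *table, int nr)
{
	int i;

	spin_lock(lock);
	for (i = 0; i < nr; i++) {
		table[i] = 0;		/* ... per-entry work ... */

		/* drops 'lock', reschedules if needed, then re-acquires it */
		cond_resched_lock(lock);
	}
	spin_unlock(lock);
}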
4554
4555int __sched cond_resched_softirq(void)
4556{
4557 BUG_ON(!in_softirq());
4558
9414232f 4559 if (need_resched() && system_state == SYSTEM_RUNNING) {
98d82567 4560 local_bh_enable();
1da177e4
LT
4561 __cond_resched();
4562 local_bh_disable();
4563 return 1;
4564 }
4565 return 0;
4566}
1da177e4
LT
4567EXPORT_SYMBOL(cond_resched_softirq);
4568
1da177e4
LT
4569/**
4570 * yield - yield the current processor to other threads.
4571 *
72fd4a35 4572 * This is a shortcut for kernel-space yielding - it marks the
1da177e4
LT
4573 * thread runnable and calls sys_sched_yield().
4574 */
4575void __sched yield(void)
4576{
4577 set_current_state(TASK_RUNNING);
4578 sys_sched_yield();
4579}
1da177e4
LT
4580EXPORT_SYMBOL(yield);
4581
4582/*
4583 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4584 * that process accounting knows that this is a task in IO wait state.
4585 *
4586 * But don't do that if it is a deliberate, throttling IO wait (this task
4587 * has set its backing_dev_info: the queue against which it should throttle)
4588 */
4589void __sched io_schedule(void)
4590{
70b97a7f 4591 struct rq *rq = &__raw_get_cpu_var(runqueues);
1da177e4 4592
0ff92245 4593 delayacct_blkio_start();
1da177e4
LT
4594 atomic_inc(&rq->nr_iowait);
4595 schedule();
4596 atomic_dec(&rq->nr_iowait);
0ff92245 4597 delayacct_blkio_end();
1da177e4 4598}
1da177e4
LT
4599EXPORT_SYMBOL(io_schedule);
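/*
 * Illustrative sketch (editor's example): code that sleeps for block I/O
 * calls io_schedule() instead of schedule() so the sleep is accounted in
 * rq->nr_iowait.  'demo_' names are invented; the I/O completion path is
 * assumed to set *io_done before waking the queue.
 */
static void demo_wait_for_io(wait_queue_head_t *wq, int *io_done)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (*io_done)
			break;
		io_schedule();		/* counted as iowait, not a plain sleep */
	}
	finish_wait(wq, &wait);
}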
4600
4601long __sched io_schedule_timeout(long timeout)
4602{
70b97a7f 4603 struct rq *rq = &__raw_get_cpu_var(runqueues);
1da177e4
LT
4604 long ret;
4605
0ff92245 4606 delayacct_blkio_start();
1da177e4
LT
4607 atomic_inc(&rq->nr_iowait);
4608 ret = schedule_timeout(timeout);
4609 atomic_dec(&rq->nr_iowait);
0ff92245 4610 delayacct_blkio_end();
1da177e4
LT
4611 return ret;
4612}
4613
4614/**
4615 * sys_sched_get_priority_max - return maximum RT priority.
4616 * @policy: scheduling class.
4617 *
4618 * this syscall returns the maximum rt_priority that can be used
4619 * by a given scheduling class.
4620 */
4621asmlinkage long sys_sched_get_priority_max(int policy)
4622{
4623 int ret = -EINVAL;
4624
4625 switch (policy) {
4626 case SCHED_FIFO:
4627 case SCHED_RR:
4628 ret = MAX_USER_RT_PRIO-1;
4629 break;
4630 case SCHED_NORMAL:
b0a9499c 4631 case SCHED_BATCH:
dd41f596 4632 case SCHED_IDLE:
1da177e4
LT
4633 ret = 0;
4634 break;
4635 }
4636 return ret;
4637}
4638
4639/**
4640 * sys_sched_get_priority_min - return minimum RT priority.
4641 * @policy: scheduling class.
4642 *
4643 * this syscall returns the minimum rt_priority that can be used
4644 * by a given scheduling class.
4645 */
4646asmlinkage long sys_sched_get_priority_min(int policy)
4647{
4648 int ret = -EINVAL;
4649
4650 switch (policy) {
4651 case SCHED_FIFO:
4652 case SCHED_RR:
4653 ret = 1;
4654 break;
4655 case SCHED_NORMAL:
b0a9499c 4656 case SCHED_BATCH:
dd41f596 4657 case SCHED_IDLE:
1da177e4
LT
4658 ret = 0;
4659 }
4660 return ret;
4661}
4662
4663/**
4664 * sys_sched_rr_get_interval - return the default timeslice of a process.
4665 * @pid: pid of the process.
4666 * @interval: userspace pointer to the timeslice value.
4667 *
4668 * this syscall writes the default timeslice value of a given process
4669 * into the user-space timespec buffer. A value of '0' means infinity.
4670 */
4671asmlinkage
4672long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4673{
36c8b586 4674 struct task_struct *p;
1da177e4
LT
4675 int retval = -EINVAL;
4676 struct timespec t;
1da177e4
LT
4677
4678 if (pid < 0)
4679 goto out_nounlock;
4680
4681 retval = -ESRCH;
4682 read_lock(&tasklist_lock);
4683 p = find_process_by_pid(pid);
4684 if (!p)
4685 goto out_unlock;
4686
4687 retval = security_task_getscheduler(p);
4688 if (retval)
4689 goto out_unlock;
4690
b78709cf 4691 jiffies_to_timespec(p->policy == SCHED_FIFO ?
dd41f596 4692 0 : static_prio_timeslice(p->static_prio), &t);
1da177e4
LT
4693 read_unlock(&tasklist_lock);
4694 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4695out_nounlock:
4696 return retval;
4697out_unlock:
4698 read_unlock(&tasklist_lock);
4699 return retval;
4700}
4701
2ed6e34f 4702static const char stat_nam[] = "RSDTtZX";
36c8b586
IM
4703
4704static void show_task(struct task_struct *p)
1da177e4 4705{
1da177e4 4706 unsigned long free = 0;
36c8b586 4707 unsigned state;
1da177e4 4708
1da177e4 4709 state = p->state ? __ffs(p->state) + 1 : 0;
2ed6e34f
AM
4710 printk("%-13.13s %c", p->comm,
4711 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
1da177e4
LT
4712#if (BITS_PER_LONG == 32)
4713 if (state == TASK_RUNNING)
4714 printk(" running ");
4715 else
4716 printk(" %08lX ", thread_saved_pc(p));
4717#else
4718 if (state == TASK_RUNNING)
4719 printk(" running task ");
4720 else
4721 printk(" %016lx ", thread_saved_pc(p));
4722#endif
4723#ifdef CONFIG_DEBUG_STACK_USAGE
4724 {
10ebffde 4725 unsigned long *n = end_of_stack(p);
1da177e4
LT
4726 while (!*n)
4727 n++;
10ebffde 4728 free = (unsigned long)n - (unsigned long)end_of_stack(p);
1da177e4
LT
4729 }
4730#endif
35f6f753 4731 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
1da177e4
LT
4732 if (!p->mm)
4733 printk(" (L-TLB)\n");
4734 else
4735 printk(" (NOTLB)\n");
4736
4737 if (state != TASK_RUNNING)
4738 show_stack(p, NULL);
4739}
4740
e59e2ae2 4741void show_state_filter(unsigned long state_filter)
1da177e4 4742{
36c8b586 4743 struct task_struct *g, *p;
1da177e4
LT
4744
4745#if (BITS_PER_LONG == 32)
4746 printk("\n"
301827ac
CC
4747 " free sibling\n");
4748 printk(" task PC stack pid father child younger older\n");
1da177e4
LT
4749#else
4750 printk("\n"
301827ac
CC
4751 " free sibling\n");
4752 printk(" task PC stack pid father child younger older\n");
1da177e4
LT
4753#endif
4754 read_lock(&tasklist_lock);
4755 do_each_thread(g, p) {
4756 /*
 4757 * reset the NMI-timeout; listing all tasks on a slow
 4758 * console might take a lot of time:
4759 */
4760 touch_nmi_watchdog();
39bc89fd 4761 if (!state_filter || (p->state & state_filter))
e59e2ae2 4762 show_task(p);
1da177e4
LT
4763 } while_each_thread(g, p);
4764
04c9167f
JF
4765 touch_all_softlockup_watchdogs();
4766
dd41f596
IM
4767#ifdef CONFIG_SCHED_DEBUG
4768 sysrq_sched_debug_show();
4769#endif
1da177e4 4770 read_unlock(&tasklist_lock);
e59e2ae2
IM
4771 /*
4772 * Only show locks if all tasks are dumped:
4773 */
4774 if (state_filter == -1)
4775 debug_show_all_locks();
1da177e4
LT
4776}
4777
1df21055
IM
4778void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4779{
dd41f596 4780 idle->sched_class = &idle_sched_class;
1df21055
IM
4781}
4782
f340c0d1
IM
4783/**
4784 * init_idle - set up an idle thread for a given CPU
4785 * @idle: task in question
4786 * @cpu: cpu the idle task belongs to
4787 *
4788 * NOTE: this function does not set the idle thread's NEED_RESCHED
4789 * flag, to make booting more robust.
4790 */
5c1e1767 4791void __cpuinit init_idle(struct task_struct *idle, int cpu)
1da177e4 4792{
70b97a7f 4793 struct rq *rq = cpu_rq(cpu);
1da177e4
LT
4794 unsigned long flags;
4795
dd41f596
IM
4796 __sched_fork(idle);
4797 idle->se.exec_start = sched_clock();
4798
b29739f9 4799 idle->prio = idle->normal_prio = MAX_PRIO;
1da177e4 4800 idle->cpus_allowed = cpumask_of_cpu(cpu);
dd41f596 4801 __set_task_cpu(idle, cpu);
1da177e4
LT
4802
4803 spin_lock_irqsave(&rq->lock, flags);
4804 rq->curr = rq->idle = idle;
4866cde0
NP
4805#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4806 idle->oncpu = 1;
4807#endif
1da177e4
LT
4808 spin_unlock_irqrestore(&rq->lock, flags);
4809
4810 /* Set the preempt count _outside_ the spinlocks! */
4811#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
a1261f54 4812 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
1da177e4 4813#else
a1261f54 4814 task_thread_info(idle)->preempt_count = 0;
1da177e4 4815#endif
dd41f596
IM
4816 /*
4817 * The idle tasks have their own, simple scheduling class:
4818 */
4819 idle->sched_class = &idle_sched_class;
1da177e4
LT
4820}
4821
4822/*
4823 * In a system that switches off the HZ timer nohz_cpu_mask
4824 * indicates which cpus entered this state. This is used
4825 * in the rcu update to wait only for active cpus. For system
4826 * which do not switch off the HZ timer nohz_cpu_mask should
4827 * always be CPU_MASK_NONE.
4828 */
4829cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4830
dd41f596
IM
4831/*
4832 * Increase the granularity value when there are more CPUs,
4833 * because with more CPUs the 'effective latency' as visible
4834 * to users decreases. But the relationship is not linear,
4835 * so pick a second-best guess by going with the log2 of the
4836 * number of CPUs.
4837 *
4838 * This idea comes from the SD scheduler of Con Kolivas:
4839 */
4840static inline void sched_init_granularity(void)
4841{
4842 unsigned int factor = 1 + ilog2(num_online_cpus());
4843 const unsigned long gran_limit = 10000000;
4844
4845 sysctl_sched_granularity *= factor;
4846 if (sysctl_sched_granularity > gran_limit)
4847 sysctl_sched_granularity = gran_limit;
4848
4849 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4850 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4851}
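/*
 * Worked example (editor's note): with 4 CPUs online, ilog2(4) == 2 so
 * factor == 3; the base granularity is tripled (but never raised beyond
 * gran_limit), the runtime limit becomes four times the granularity and
 * the wakeup granularity half of it.
 */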
4852
1da177e4
LT
4853#ifdef CONFIG_SMP
4854/*
4855 * This is how migration works:
4856 *
70b97a7f 4857 * 1) we queue a struct migration_req structure in the source CPU's
1da177e4
LT
4858 * runqueue and wake up that CPU's migration thread.
4859 * 2) we down() the locked semaphore => thread blocks.
4860 * 3) migration thread wakes up (implicitly it forces the migrated
4861 * thread off the CPU)
4862 * 4) it gets the migration request and checks whether the migrated
4863 * task is still in the wrong runqueue.
4864 * 5) if it's in the wrong runqueue then the migration thread removes
4865 * it and puts it into the right queue.
4866 * 6) migration thread up()s the semaphore.
4867 * 7) we wake up and the migration is done.
4868 */
4869
4870/*
4871 * Change a given task's CPU affinity. Migrate the thread to a
4872 * proper CPU and schedule it away if the CPU it's executing on
4873 * is removed from the allowed bitmask.
4874 *
4875 * NOTE: the caller must have a valid reference to the task, the
4876 * task must not exit() & deallocate itself prematurely. The
4877 * call is not atomic; no spinlocks may be held.
4878 */
36c8b586 4879int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1da177e4 4880{
70b97a7f 4881 struct migration_req req;
1da177e4 4882 unsigned long flags;
70b97a7f 4883 struct rq *rq;
48f24c4d 4884 int ret = 0;
1da177e4
LT
4885
4886 rq = task_rq_lock(p, &flags);
4887 if (!cpus_intersects(new_mask, cpu_online_map)) {
4888 ret = -EINVAL;
4889 goto out;
4890 }
4891
4892 p->cpus_allowed = new_mask;
4893 /* Can the task run on the task's current CPU? If so, we're done */
4894 if (cpu_isset(task_cpu(p), new_mask))
4895 goto out;
4896
4897 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4898 /* Need help from migration thread: drop lock and wait. */
4899 task_rq_unlock(rq, &flags);
4900 wake_up_process(rq->migration_thread);
4901 wait_for_completion(&req.done);
4902 tlb_migrate_finish(p->mm);
4903 return 0;
4904 }
4905out:
4906 task_rq_unlock(rq, &flags);
48f24c4d 4907
1da177e4
LT
4908 return ret;
4909}
1da177e4
LT
4910EXPORT_SYMBOL_GPL(set_cpus_allowed);
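/*
 * Illustrative sketch (editor's example): a typical caller pins a task it
 * owns to a single CPU.  'demo_pin_task_to_cpu' is an invented helper; the
 * caller must hold a reference on 'p' and, as noted above, must not hold
 * any spinlocks.
 */
static int demo_pin_task_to_cpu(struct task_struct *p, int cpu)
{
	return set_cpus_allowed(p, cpumask_of_cpu(cpu));
}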
4911
4912/*
 4913 * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
 4914 * this either because it can't run here any more (set_cpus_allowed() moved
 4915 * it away from this CPU, or the CPU is going down), or because we're
4916 * attempting to rebalance this task on exec (sched_exec).
4917 *
4918 * So we race with normal scheduler movements, but that's OK, as long
4919 * as the task is no longer on this CPU.
efc30814
KK
4920 *
4921 * Returns non-zero if task was successfully migrated.
1da177e4 4922 */
efc30814 4923static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
1da177e4 4924{
70b97a7f 4925 struct rq *rq_dest, *rq_src;
dd41f596 4926 int ret = 0, on_rq;
1da177e4
LT
4927
4928 if (unlikely(cpu_is_offline(dest_cpu)))
efc30814 4929 return ret;
1da177e4
LT
4930
4931 rq_src = cpu_rq(src_cpu);
4932 rq_dest = cpu_rq(dest_cpu);
4933
4934 double_rq_lock(rq_src, rq_dest);
4935 /* Already moved. */
4936 if (task_cpu(p) != src_cpu)
4937 goto out;
4938 /* Affinity changed (again). */
4939 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4940 goto out;
4941
dd41f596
IM
4942 on_rq = p->se.on_rq;
4943 if (on_rq)
4944 deactivate_task(rq_src, p, 0);
1da177e4 4945 set_task_cpu(p, dest_cpu);
dd41f596
IM
4946 if (on_rq) {
4947 activate_task(rq_dest, p, 0);
4948 check_preempt_curr(rq_dest, p);
1da177e4 4949 }
efc30814 4950 ret = 1;
1da177e4
LT
4951out:
4952 double_rq_unlock(rq_src, rq_dest);
efc30814 4953 return ret;
1da177e4
LT
4954}
4955
4956/*
4957 * migration_thread - this is a highprio system thread that performs
4958 * thread migration by bumping thread off CPU then 'pushing' onto
4959 * another runqueue.
4960 */
95cdf3b7 4961static int migration_thread(void *data)
1da177e4 4962{
1da177e4 4963 int cpu = (long)data;
70b97a7f 4964 struct rq *rq;
1da177e4
LT
4965
4966 rq = cpu_rq(cpu);
4967 BUG_ON(rq->migration_thread != current);
4968
4969 set_current_state(TASK_INTERRUPTIBLE);
4970 while (!kthread_should_stop()) {
70b97a7f 4971 struct migration_req *req;
1da177e4 4972 struct list_head *head;
1da177e4 4973
3e1d1d28 4974 try_to_freeze();
1da177e4
LT
4975
4976 spin_lock_irq(&rq->lock);
4977
4978 if (cpu_is_offline(cpu)) {
4979 spin_unlock_irq(&rq->lock);
4980 goto wait_to_die;
4981 }
4982
4983 if (rq->active_balance) {
4984 active_load_balance(rq, cpu);
4985 rq->active_balance = 0;
4986 }
4987
4988 head = &rq->migration_queue;
4989
4990 if (list_empty(head)) {
4991 spin_unlock_irq(&rq->lock);
4992 schedule();
4993 set_current_state(TASK_INTERRUPTIBLE);
4994 continue;
4995 }
70b97a7f 4996 req = list_entry(head->next, struct migration_req, list);
1da177e4
LT
4997 list_del_init(head->next);
4998
674311d5
NP
4999 spin_unlock(&rq->lock);
5000 __migrate_task(req->task, cpu, req->dest_cpu);
5001 local_irq_enable();
1da177e4
LT
5002
5003 complete(&req->done);
5004 }
5005 __set_current_state(TASK_RUNNING);
5006 return 0;
5007
5008wait_to_die:
5009 /* Wait for kthread_stop */
5010 set_current_state(TASK_INTERRUPTIBLE);
5011 while (!kthread_should_stop()) {
5012 schedule();
5013 set_current_state(TASK_INTERRUPTIBLE);
5014 }
5015 __set_current_state(TASK_RUNNING);
5016 return 0;
5017}
5018
5019#ifdef CONFIG_HOTPLUG_CPU
054b9108
KK
5020/*
 5021 * Figure out where a task on a dead CPU should go; use force if necessary.
5022 * NOTE: interrupts should be disabled by the caller
5023 */
48f24c4d 5024static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
1da177e4 5025{
efc30814 5026 unsigned long flags;
1da177e4 5027 cpumask_t mask;
70b97a7f
IM
5028 struct rq *rq;
5029 int dest_cpu;
1da177e4 5030
efc30814 5031restart:
1da177e4
LT
5032 /* On same node? */
5033 mask = node_to_cpumask(cpu_to_node(dead_cpu));
48f24c4d 5034 cpus_and(mask, mask, p->cpus_allowed);
1da177e4
LT
5035 dest_cpu = any_online_cpu(mask);
5036
5037 /* On any allowed CPU? */
5038 if (dest_cpu == NR_CPUS)
48f24c4d 5039 dest_cpu = any_online_cpu(p->cpus_allowed);
1da177e4
LT
5040
5041 /* No more Mr. Nice Guy. */
5042 if (dest_cpu == NR_CPUS) {
48f24c4d
IM
5043 rq = task_rq_lock(p, &flags);
5044 cpus_setall(p->cpus_allowed);
5045 dest_cpu = any_online_cpu(p->cpus_allowed);
efc30814 5046 task_rq_unlock(rq, &flags);
1da177e4
LT
5047
5048 /*
5049 * Don't tell them about moving exiting tasks or
5050 * kernel threads (both mm NULL), since they never
5051 * leave kernel.
5052 */
48f24c4d 5053 if (p->mm && printk_ratelimit())
1da177e4
LT
5054 printk(KERN_INFO "process %d (%s) no "
5055 "longer affine to cpu%d\n",
48f24c4d 5056 p->pid, p->comm, dead_cpu);
1da177e4 5057 }
48f24c4d 5058 if (!__migrate_task(p, dead_cpu, dest_cpu))
efc30814 5059 goto restart;
1da177e4
LT
5060}
5061
5062/*
5063 * While a dead CPU has no uninterruptible tasks queued at this point,
5064 * it might still have a nonzero ->nr_uninterruptible counter, because
 5065 * for performance reasons the counter is not strictly tracking tasks to
5066 * their home CPUs. So we just add the counter to another CPU's counter,
5067 * to keep the global sum constant after CPU-down:
5068 */
70b97a7f 5069static void migrate_nr_uninterruptible(struct rq *rq_src)
1da177e4 5070{
70b97a7f 5071 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
1da177e4
LT
5072 unsigned long flags;
5073
5074 local_irq_save(flags);
5075 double_rq_lock(rq_src, rq_dest);
5076 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5077 rq_src->nr_uninterruptible = 0;
5078 double_rq_unlock(rq_src, rq_dest);
5079 local_irq_restore(flags);
5080}
5081
5082/* Run through task list and migrate tasks from the dead cpu. */
5083static void migrate_live_tasks(int src_cpu)
5084{
48f24c4d 5085 struct task_struct *p, *t;
1da177e4
LT
5086
5087 write_lock_irq(&tasklist_lock);
5088
48f24c4d
IM
5089 do_each_thread(t, p) {
5090 if (p == current)
1da177e4
LT
5091 continue;
5092
48f24c4d
IM
5093 if (task_cpu(p) == src_cpu)
5094 move_task_off_dead_cpu(src_cpu, p);
5095 } while_each_thread(t, p);
1da177e4
LT
5096
5097 write_unlock_irq(&tasklist_lock);
5098}
5099
dd41f596
IM
5100/*
 5101 * Schedules the idle task to be the next runnable task on the current CPU.
1da177e4 5102 * It does so by boosting its priority to the highest possible and adding it to
48f24c4d 5103 * the _front_ of the runqueue. Used by CPU offline code.
1da177e4
LT
5104 */
5105void sched_idle_next(void)
5106{
48f24c4d 5107 int this_cpu = smp_processor_id();
70b97a7f 5108 struct rq *rq = cpu_rq(this_cpu);
1da177e4
LT
5109 struct task_struct *p = rq->idle;
5110 unsigned long flags;
5111
5112 /* cpu has to be offline */
48f24c4d 5113 BUG_ON(cpu_online(this_cpu));
1da177e4 5114
48f24c4d
IM
5115 /*
5116 * Strictly not necessary since rest of the CPUs are stopped by now
5117 * and interrupts disabled on the current cpu.
1da177e4
LT
5118 */
5119 spin_lock_irqsave(&rq->lock, flags);
5120
dd41f596 5121 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
48f24c4d
IM
5122
5123 /* Add idle task to the _front_ of its priority queue: */
dd41f596 5124 activate_idle_task(p, rq);
1da177e4
LT
5125
5126 spin_unlock_irqrestore(&rq->lock, flags);
5127}
5128
48f24c4d
IM
5129/*
5130 * Ensures that the idle task is using init_mm right before its cpu goes
1da177e4
LT
5131 * offline.
5132 */
5133void idle_task_exit(void)
5134{
5135 struct mm_struct *mm = current->active_mm;
5136
5137 BUG_ON(cpu_online(smp_processor_id()));
5138
5139 if (mm != &init_mm)
5140 switch_mm(mm, &init_mm, current);
5141 mmdrop(mm);
5142}
5143
054b9108 5144/* called under rq->lock with disabled interrupts */
36c8b586 5145static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
1da177e4 5146{
70b97a7f 5147 struct rq *rq = cpu_rq(dead_cpu);
1da177e4
LT
5148
5149 /* Must be exiting, otherwise would be on tasklist. */
48f24c4d 5150 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
1da177e4
LT
5151
5152 /* Cannot have done final schedule yet: would have vanished. */
c394cc9f 5153 BUG_ON(p->state == TASK_DEAD);
1da177e4 5154
48f24c4d 5155 get_task_struct(p);
1da177e4
LT
5156
5157 /*
5158 * Drop lock around migration; if someone else moves it,
5159 * that's OK. No task can be added to this CPU, so iteration is
5160 * fine.
054b9108 5161 * NOTE: interrupts should be left disabled --dev@
1da177e4 5162 */
054b9108 5163 spin_unlock(&rq->lock);
48f24c4d 5164 move_task_off_dead_cpu(dead_cpu, p);
054b9108 5165 spin_lock(&rq->lock);
1da177e4 5166
48f24c4d 5167 put_task_struct(p);
1da177e4
LT
5168}
5169
5170/* release_task() removes task from tasklist, so we won't find dead tasks. */
5171static void migrate_dead_tasks(unsigned int dead_cpu)
5172{
70b97a7f 5173 struct rq *rq = cpu_rq(dead_cpu);
dd41f596 5174 struct task_struct *next;
48f24c4d 5175
dd41f596
IM
5176 for ( ; ; ) {
5177 if (!rq->nr_running)
5178 break;
5179 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5180 if (!next)
5181 break;
5182 migrate_dead(dead_cpu, next);
1da177e4
LT
5183 }
5184}
5185#endif /* CONFIG_HOTPLUG_CPU */
5186
5187/*
5188 * migration_call - callback that gets triggered when a CPU is added.
5189 * Here we can start up the necessary migration thread for the new CPU.
5190 */
48f24c4d
IM
5191static int __cpuinit
5192migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
1da177e4 5193{
1da177e4 5194 struct task_struct *p;
48f24c4d 5195 int cpu = (long)hcpu;
1da177e4 5196 unsigned long flags;
70b97a7f 5197 struct rq *rq;
1da177e4
LT
5198
5199 switch (action) {
5be9361c
GS
5200 case CPU_LOCK_ACQUIRE:
5201 mutex_lock(&sched_hotcpu_mutex);
5202 break;
5203
1da177e4 5204 case CPU_UP_PREPARE:
8bb78442 5205 case CPU_UP_PREPARE_FROZEN:
dd41f596 5206 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
1da177e4
LT
5207 if (IS_ERR(p))
5208 return NOTIFY_BAD;
5209 p->flags |= PF_NOFREEZE;
5210 kthread_bind(p, cpu);
5211 /* Must be high prio: stop_machine expects to yield to it. */
5212 rq = task_rq_lock(p, &flags);
dd41f596 5213 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
1da177e4
LT
5214 task_rq_unlock(rq, &flags);
5215 cpu_rq(cpu)->migration_thread = p;
5216 break;
48f24c4d 5217
1da177e4 5218 case CPU_ONLINE:
8bb78442 5219 case CPU_ONLINE_FROZEN:
1da177e4
LT
 5220 /* Strictly unnecessary, as the first user will wake it. */
5221 wake_up_process(cpu_rq(cpu)->migration_thread);
5222 break;
48f24c4d 5223
1da177e4
LT
5224#ifdef CONFIG_HOTPLUG_CPU
5225 case CPU_UP_CANCELED:
8bb78442 5226 case CPU_UP_CANCELED_FROZEN:
fc75cdfa
HC
5227 if (!cpu_rq(cpu)->migration_thread)
5228 break;
1da177e4 5229 /* Unbind it from offline cpu so it can run. Fall thru. */
a4c4af7c
HC
5230 kthread_bind(cpu_rq(cpu)->migration_thread,
5231 any_online_cpu(cpu_online_map));
1da177e4
LT
5232 kthread_stop(cpu_rq(cpu)->migration_thread);
5233 cpu_rq(cpu)->migration_thread = NULL;
5234 break;
48f24c4d 5235
1da177e4 5236 case CPU_DEAD:
8bb78442 5237 case CPU_DEAD_FROZEN:
1da177e4
LT
5238 migrate_live_tasks(cpu);
5239 rq = cpu_rq(cpu);
5240 kthread_stop(rq->migration_thread);
5241 rq->migration_thread = NULL;
5242 /* Idle task back to normal (off runqueue, low prio) */
5243 rq = task_rq_lock(rq->idle, &flags);
dd41f596 5244 deactivate_task(rq, rq->idle, 0);
1da177e4 5245 rq->idle->static_prio = MAX_PRIO;
dd41f596
IM
5246 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5247 rq->idle->sched_class = &idle_sched_class;
1da177e4
LT
5248 migrate_dead_tasks(cpu);
5249 task_rq_unlock(rq, &flags);
5250 migrate_nr_uninterruptible(rq);
5251 BUG_ON(rq->nr_running != 0);
5252
5253 /* No need to migrate the tasks: it was best-effort if
5be9361c 5254 * they didn't take sched_hotcpu_mutex. Just wake up
1da177e4
LT
5255 * the requestors. */
5256 spin_lock_irq(&rq->lock);
5257 while (!list_empty(&rq->migration_queue)) {
70b97a7f
IM
5258 struct migration_req *req;
5259
1da177e4 5260 req = list_entry(rq->migration_queue.next,
70b97a7f 5261 struct migration_req, list);
1da177e4
LT
5262 list_del_init(&req->list);
5263 complete(&req->done);
5264 }
5265 spin_unlock_irq(&rq->lock);
5266 break;
5267#endif
5be9361c
GS
5268 case CPU_LOCK_RELEASE:
5269 mutex_unlock(&sched_hotcpu_mutex);
5270 break;
1da177e4
LT
5271 }
5272 return NOTIFY_OK;
5273}
5274
5275/* Register at highest priority so that task migration (migrate_all_tasks)
5276 * happens before everything else.
5277 */
26c2143b 5278static struct notifier_block __cpuinitdata migration_notifier = {
1da177e4
LT
5279 .notifier_call = migration_call,
5280 .priority = 10
5281};
5282
5283int __init migration_init(void)
5284{
5285 void *cpu = (void *)(long)smp_processor_id();
07dccf33 5286 int err;
48f24c4d
IM
5287
5288 /* Start one for the boot CPU: */
07dccf33
AM
5289 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5290 BUG_ON(err == NOTIFY_BAD);
1da177e4
LT
5291 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5292 register_cpu_notifier(&migration_notifier);
48f24c4d 5293
1da177e4
LT
5294 return 0;
5295}
5296#endif
5297
5298#ifdef CONFIG_SMP
476f3534
CL
5299
5300/* Number of possible processor ids */
5301int nr_cpu_ids __read_mostly = NR_CPUS;
5302EXPORT_SYMBOL(nr_cpu_ids);
5303
1a20ff27 5304#undef SCHED_DOMAIN_DEBUG
1da177e4
LT
5305#ifdef SCHED_DOMAIN_DEBUG
5306static void sched_domain_debug(struct sched_domain *sd, int cpu)
5307{
5308 int level = 0;
5309
41c7ce9a
NP
5310 if (!sd) {
5311 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5312 return;
5313 }
5314
1da177e4
LT
5315 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5316
5317 do {
5318 int i;
5319 char str[NR_CPUS];
5320 struct sched_group *group = sd->groups;
5321 cpumask_t groupmask;
5322
5323 cpumask_scnprintf(str, NR_CPUS, sd->span);
5324 cpus_clear(groupmask);
5325
5326 printk(KERN_DEBUG);
5327 for (i = 0; i < level + 1; i++)
5328 printk(" ");
5329 printk("domain %d: ", level);
5330
5331 if (!(sd->flags & SD_LOAD_BALANCE)) {
5332 printk("does not load-balance\n");
5333 if (sd->parent)
33859f7f
MOS
5334 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5335 " has parent");
1da177e4
LT
5336 break;
5337 }
5338
5339 printk("span %s\n", str);
5340
5341 if (!cpu_isset(cpu, sd->span))
33859f7f
MOS
5342 printk(KERN_ERR "ERROR: domain->span does not contain "
5343 "CPU%d\n", cpu);
1da177e4 5344 if (!cpu_isset(cpu, group->cpumask))
33859f7f
MOS
5345 printk(KERN_ERR "ERROR: domain->groups does not contain"
5346 " CPU%d\n", cpu);
1da177e4
LT
5347
5348 printk(KERN_DEBUG);
5349 for (i = 0; i < level + 2; i++)
5350 printk(" ");
5351 printk("groups:");
5352 do {
5353 if (!group) {
5354 printk("\n");
5355 printk(KERN_ERR "ERROR: group is NULL\n");
5356 break;
5357 }
5358
5517d86b 5359 if (!group->__cpu_power) {
1da177e4 5360 printk("\n");
33859f7f
MOS
5361 printk(KERN_ERR "ERROR: domain->cpu_power not "
5362 "set\n");
1da177e4
LT
5363 }
5364
5365 if (!cpus_weight(group->cpumask)) {
5366 printk("\n");
5367 printk(KERN_ERR "ERROR: empty group\n");
5368 }
5369
5370 if (cpus_intersects(groupmask, group->cpumask)) {
5371 printk("\n");
5372 printk(KERN_ERR "ERROR: repeated CPUs\n");
5373 }
5374
5375 cpus_or(groupmask, groupmask, group->cpumask);
5376
5377 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5378 printk(" %s", str);
5379
5380 group = group->next;
5381 } while (group != sd->groups);
5382 printk("\n");
5383
5384 if (!cpus_equal(sd->span, groupmask))
33859f7f
MOS
5385 printk(KERN_ERR "ERROR: groups don't span "
5386 "domain->span\n");
1da177e4
LT
5387
5388 level++;
5389 sd = sd->parent;
33859f7f
MOS
5390 if (!sd)
5391 continue;
1da177e4 5392
33859f7f
MOS
5393 if (!cpus_subset(groupmask, sd->span))
5394 printk(KERN_ERR "ERROR: parent span is not a superset "
5395 "of domain->span\n");
1da177e4
LT
5396
5397 } while (sd);
5398}
5399#else
5400# define sched_domain_debug(sd, cpu) do { } while (0)
5401#endif
5402
5403static int sd_degenerate(struct sched_domain *sd)
5404{
5405 if (cpus_weight(sd->span) == 1)
5406 return 1;
5407
5408 /* Following flags need at least 2 groups */
5409 if (sd->flags & (SD_LOAD_BALANCE |
5410 SD_BALANCE_NEWIDLE |
5411 SD_BALANCE_FORK |
5412 SD_BALANCE_EXEC |
5413 SD_SHARE_CPUPOWER |
5414 SD_SHARE_PKG_RESOURCES)) {
5415 if (sd->groups != sd->groups->next)
5416 return 0;
5417 }
5418
5419 /* Following flags don't use groups */
5420 if (sd->flags & (SD_WAKE_IDLE |
5421 SD_WAKE_AFFINE |
5422 SD_WAKE_BALANCE))
5423 return 0;
5424
5425 return 1;
5426}
5427
5428static int
5429sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5430{
5431 unsigned long cflags = sd->flags, pflags = parent->flags;
5432
5433 if (sd_degenerate(parent))
5434 return 1;
5435
5436 if (!cpus_equal(sd->span, parent->span))
5437 return 0;
5438
5439 /* Does parent contain flags not in child? */
5440 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5441 if (cflags & SD_WAKE_AFFINE)
5442 pflags &= ~SD_WAKE_BALANCE;
5443 /* Flags needing groups don't count if only 1 group in parent */
5444 if (parent->groups == parent->groups->next) {
5445 pflags &= ~(SD_LOAD_BALANCE |
5446 SD_BALANCE_NEWIDLE |
5447 SD_BALANCE_FORK |
5448 SD_BALANCE_EXEC |
5449 SD_SHARE_CPUPOWER |
5450 SD_SHARE_PKG_RESOURCES);
5451 }
5452 if (~cflags & pflags)
5453 return 0;
5454
5455 return 1;
5456}
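/*
 * Illustrative sketch (not part of sched.c): a stand-alone user-space model
 * of the flag pruning done by sd_parent_degenerate() above.  The X_* bits
 * are arbitrary values invented for this example, not the kernel's SD_*
 * definitions; compile it on its own with any C compiler.
 */
#include <stdio.h>

#define X_LOAD_BALANCE	0x01	/* needs at least two groups */
#define X_BALANCE_FORK	0x02	/* needs at least two groups */
#define X_WAKE_AFFINE	0x04	/* does not use groups */
#define X_WAKE_BALANCE	0x08	/* treated as a subset of WAKE_AFFINE */

static unsigned long prune_parent_flags(unsigned long cflags,
					unsigned long pflags,
					int parent_has_single_group)
{
	/* WAKE_BALANCE adds nothing if the child already has WAKE_AFFINE */
	if (cflags & X_WAKE_AFFINE)
		pflags &= ~X_WAKE_BALANCE;
	/* flags that need two or more groups are meaningless with one group */
	if (parent_has_single_group)
		pflags &= ~(X_LOAD_BALANCE | X_BALANCE_FORK);
	return pflags;
}

int main(void)
{
	unsigned long child = X_LOAD_BALANCE | X_WAKE_AFFINE;
	unsigned long parent = X_LOAD_BALANCE | X_WAKE_BALANCE;

	/* a single-group parent ends up with nothing the child lacks */
	parent = prune_parent_flags(child, parent, 1);
	printf("parent degenerates: %s\n", (~child & parent) ? "no" : "yes");
	return 0;
}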
5457
5458/*
5459 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5460 * hold the hotplug lock.
5461 */
5462static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5463{
5464 struct rq *rq = cpu_rq(cpu);
5465 struct sched_domain *tmp;
5466
5467 /* Remove the sched domains which do not contribute to scheduling. */
5468 for (tmp = sd; tmp; tmp = tmp->parent) {
5469 struct sched_domain *parent = tmp->parent;
5470 if (!parent)
5471 break;
5472 if (sd_parent_degenerate(tmp, parent)) {
5473 tmp->parent = parent->parent;
5474 if (parent->parent)
5475 parent->parent->child = tmp;
5476 }
5477 }
5478
5479 if (sd && sd_degenerate(sd)) {
5480 sd = sd->parent;
5481 if (sd)
5482 sd->child = NULL;
5483 }
5484
5485 sched_domain_debug(sd, cpu);
5486
5487 rcu_assign_pointer(rq->sd, sd);
5488}
5489
5490/* cpus with isolated domains */
5491static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5492
5493/* Setup the mask of cpus configured for isolated domains */
5494static int __init isolated_cpu_setup(char *str)
5495{
5496 int ints[NR_CPUS], i;
5497
5498 str = get_options(str, ARRAY_SIZE(ints), ints);
5499 cpus_clear(cpu_isolated_map);
5500 for (i = 1; i <= ints[0]; i++)
5501 if (ints[i] < NR_CPUS)
5502 cpu_set(ints[i], cpu_isolated_map);
5503 return 1;
5504}
5505
5506__setup ("isolcpus=", isolated_cpu_setup);
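/*
 * Usage note (illustrative): CPUs listed in the "isolcpus=" boot parameter
 * end up in cpu_isolated_map and are excluded from the default sched
 * domains built below.  For example, booting with
 *
 *	isolcpus=2,3
 *
 * keeps CPUs 2 and 3 out of load balancing; tasks run there only when bound
 * explicitly, e.g. via sched_setaffinity() or cpusets.
 */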
5507
5508/*
5509 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5510 * to a function which identifies what group (along with its sched group) a CPU
5511 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5512 * (due to the fact that we keep track of groups covered with a cpumask_t).
5513 *
5514 * init_sched_build_groups will build a circular linked list of the groups
5515 * covered by the given span, and will set each group's ->cpumask correctly,
5516 * and ->cpu_power to 0.
5517 */
5518static void
5519init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5520 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5521 struct sched_group **sg))
5522{
5523 struct sched_group *first = NULL, *last = NULL;
5524 cpumask_t covered = CPU_MASK_NONE;
5525 int i;
5526
5527 for_each_cpu_mask(i, span) {
5528 struct sched_group *sg;
5529 int group = group_fn(i, cpu_map, &sg);
5530 int j;
5531
5532 if (cpu_isset(i, covered))
5533 continue;
5534
5535 sg->cpumask = CPU_MASK_NONE;
5536 sg->__cpu_power = 0;
5537
5538 for_each_cpu_mask(j, span) {
5539 if (group_fn(j, cpu_map, NULL) != group)
5540 continue;
5541
5542 cpu_set(j, covered);
5543 cpu_set(j, sg->cpumask);
5544 }
5545 if (!first)
5546 first = sg;
5547 if (last)
5548 last->next = sg;
5549 last = sg;
5550 }
5551 last->next = first;
5552}
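/*
 * Illustrative sketch (not part of sched.c): a stand-alone user-space model
 * of the loop above.  Eight "CPUs" are classified by a toy group function
 * (cpu / 2, i.e. sibling pairs) and linked into the same kind of circular
 * group list that init_sched_build_groups() produces.
 */
#include <stdio.h>

#define NCPU 8

struct toy_group {
	unsigned long cpumask;		/* bit i set => CPU i is a member */
	struct toy_group *next;
};

static struct toy_group pool[NCPU];	/* one slot per possible group id */

static int toy_group_fn(int cpu)
{
	return cpu / 2;			/* stand-in for cpu_to_*_group() */
}

int main(void)
{
	struct toy_group *first = NULL, *last = NULL, *sg;
	unsigned long covered = 0;
	int i, j;

	for (i = 0; i < NCPU; i++) {
		if (covered & (1UL << i))
			continue;
		sg = &pool[toy_group_fn(i)];
		for (j = 0; j < NCPU; j++) {
			if (toy_group_fn(j) != toy_group_fn(i))
				continue;
			covered |= 1UL << j;
			sg->cpumask |= 1UL << j;
		}
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;		/* close the ring, as above */

	sg = first;
	for (i = 0; i < NCPU / 2; i++, sg = sg->next)
		printf("group %d: mask 0x%lx\n", i, sg->cpumask);
	return 0;
}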
5553
5554#define SD_NODES_PER_DOMAIN 16
5555
5556#ifdef CONFIG_NUMA
5557
5558/**
5559 * find_next_best_node - find the next node to include in a sched_domain
5560 * @node: node whose sched_domain we're building
5561 * @used_nodes: nodes already in the sched_domain
5562 *
5563 * Find the next node to include in a given scheduling domain. Simply
5564 * finds the closest node not already in the @used_nodes map.
5565 *
5566 * Should use nodemask_t.
5567 */
5568static int find_next_best_node(int node, unsigned long *used_nodes)
5569{
5570 int i, n, val, min_val, best_node = 0;
5571
5572 min_val = INT_MAX;
5573
5574 for (i = 0; i < MAX_NUMNODES; i++) {
5575 /* Start at @node */
5576 n = (node + i) % MAX_NUMNODES;
5577
5578 if (!nr_cpus_node(n))
5579 continue;
5580
5581 /* Skip already used nodes */
5582 if (test_bit(n, used_nodes))
5583 continue;
5584
5585 /* Simple min distance search */
5586 val = node_distance(node, n);
5587
5588 if (val < min_val) {
5589 min_val = val;
5590 best_node = n;
5591 }
5592 }
5593
5594 set_bit(best_node, used_nodes);
5595 return best_node;
5596}
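/*
 * Illustrative sketch (not part of sched.c): a stand-alone user-space model
 * of the nearest-node search above, using a made-up 4-node distance table
 * in place of node_distance() (the !nr_cpus_node() check is omitted for
 * brevity).
 */
#include <stdio.h>
#include <limits.h>

#define NNODES 4

static const int toy_dist[NNODES][NNODES] = {	/* invented SLIT-style table */
	{ 10, 20, 40, 30 },
	{ 20, 10, 30, 40 },
	{ 40, 30, 10, 20 },
	{ 30, 40, 20, 10 },
};

static int toy_next_best_node(int node, unsigned long *used)
{
	int i, n, best = 0, min_val = INT_MAX;

	for (i = 0; i < NNODES; i++) {
		n = (node + i) % NNODES;	/* start the scan at @node */
		if (*used & (1UL << n))
			continue;		/* already in the domain */
		if (toy_dist[node][n] < min_val) {
			min_val = toy_dist[node][n];
			best = n;
		}
	}
	*used |= 1UL << best;
	return best;
}

int main(void)
{
	unsigned long used = 1UL << 0;		/* node 0 is the home node */
	int i;

	for (i = 1; i < NNODES; i++)
		printf("pick %d: node %d\n", i, toy_next_best_node(0, &used));
	return 0;
}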
5597
5598/**
5599 * sched_domain_node_span - get a cpumask for a node's sched_domain
5600 * @node: node whose cpumask we're constructing
5601 * @size: number of nodes to include in this span
5602 *
5603 * Given a node, construct a good cpumask for its sched_domain to span. It
5604 * should be one that prevents unnecessary balancing, but also spreads tasks
5605 * out optimally.
5606 */
5607static cpumask_t sched_domain_node_span(int node)
5608{
5609 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5610 cpumask_t span, nodemask;
5611 int i;
5612
5613 cpus_clear(span);
5614 bitmap_zero(used_nodes, MAX_NUMNODES);
5615
5616 nodemask = node_to_cpumask(node);
5617 cpus_or(span, span, nodemask);
5618 set_bit(node, used_nodes);
5619
5620 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5621 int next_node = find_next_best_node(node, used_nodes);
5622
5623 nodemask = node_to_cpumask(next_node);
5624 cpus_or(span, span, nodemask);
5625 }
5626
5627 return span;
5628}
5629#endif
5630
5631int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5632
5633/*
5634 * SMT sched-domains:
5635 */
5636#ifdef CONFIG_SCHED_SMT
5637static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5638static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5639
5640static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5641 struct sched_group **sg)
5642{
5643 if (sg)
5644 *sg = &per_cpu(sched_group_cpus, cpu);
5645 return cpu;
5646}
5647#endif
5648
5649/*
5650 * multi-core sched-domains:
5651 */
5652#ifdef CONFIG_SCHED_MC
5653static DEFINE_PER_CPU(struct sched_domain, core_domains);
5654static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5655#endif
5656
5657#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5658static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5659 struct sched_group **sg)
5660{
5661 int group;
5662 cpumask_t mask = cpu_sibling_map[cpu];
5663 cpus_and(mask, mask, *cpu_map);
5664 group = first_cpu(mask);
5665 if (sg)
5666 *sg = &per_cpu(sched_group_core, group);
5667 return group;
5668}
5669#elif defined(CONFIG_SCHED_MC)
5670static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5671 struct sched_group **sg)
5672{
5673 if (sg)
5674 *sg = &per_cpu(sched_group_core, cpu);
5675 return cpu;
5676}
5677#endif
5678
5679static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5680static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5681
5682static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5683 struct sched_group **sg)
5684{
5685 int group;
5686#ifdef CONFIG_SCHED_MC
5687 cpumask_t mask = cpu_coregroup_map(cpu);
5688 cpus_and(mask, mask, *cpu_map);
5689 group = first_cpu(mask);
5690#elif defined(CONFIG_SCHED_SMT)
5691 cpumask_t mask = cpu_sibling_map[cpu];
5692 cpus_and(mask, mask, *cpu_map);
5693 group = first_cpu(mask);
5694#else
5695 group = cpu;
5696#endif
5697 if (sg)
5698 *sg = &per_cpu(sched_group_phys, group);
5699 return group;
5700}
5701
5702#ifdef CONFIG_NUMA
5703/*
5704 * The init_sched_build_groups can't handle what we want to do with node
5705 * groups, so roll our own. Now each node has its own list of groups which
5706 * gets dynamically allocated.
5707 */
5708static DEFINE_PER_CPU(struct sched_domain, node_domains);
5709static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5710
5711static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5712static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5713
5714static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5715 struct sched_group **sg)
5716{
5717 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5718 int group;
5719
5720 cpus_and(nodemask, nodemask, *cpu_map);
5721 group = first_cpu(nodemask);
5722
5723 if (sg)
5724 *sg = &per_cpu(sched_group_allnodes, group);
5725 return group;
5726}
5727
5728static void init_numa_sched_groups_power(struct sched_group *group_head)
5729{
5730 struct sched_group *sg = group_head;
5731 int j;
5732
5733 if (!sg)
5734 return;
5735next_sg:
5736 for_each_cpu_mask(j, sg->cpumask) {
5737 struct sched_domain *sd;
5738
5739 sd = &per_cpu(phys_domains, j);
5740 if (j != first_cpu(sd->groups->cpumask)) {
5741 /*
5742 * Only add "power" once for each
5743 * physical package.
5744 */
5745 continue;
5746 }
5747
5748 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5749 }
5750 sg = sg->next;
5751 if (sg != group_head)
5752 goto next_sg;
5753}
5754#endif
5755
5756#ifdef CONFIG_NUMA
5757/* Free memory allocated for various sched_group structures */
5758static void free_sched_groups(const cpumask_t *cpu_map)
5759{
5760 int cpu, i;
5761
5762 for_each_cpu_mask(cpu, *cpu_map) {
5763 struct sched_group **sched_group_nodes
5764 = sched_group_nodes_bycpu[cpu];
5765
5766 if (!sched_group_nodes)
5767 continue;
5768
5769 for (i = 0; i < MAX_NUMNODES; i++) {
5770 cpumask_t nodemask = node_to_cpumask(i);
5771 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5772
5773 cpus_and(nodemask, nodemask, *cpu_map);
5774 if (cpus_empty(nodemask))
5775 continue;
5776
5777 if (sg == NULL)
5778 continue;
5779 sg = sg->next;
5780next_sg:
5781 oldsg = sg;
5782 sg = sg->next;
5783 kfree(oldsg);
5784 if (oldsg != sched_group_nodes[i])
5785 goto next_sg;
5786 }
5787 kfree(sched_group_nodes);
5788 sched_group_nodes_bycpu[cpu] = NULL;
5789 }
5790}
5791#else
5792static void free_sched_groups(const cpumask_t *cpu_map)
5793{
5794}
5795#endif
5796
5797/*
5798 * Initialize sched groups cpu_power.
5799 *
5800 * cpu_power indicates the capacity of sched group, which is used while
5801 * distributing the load between different sched groups in a sched domain.
5802 * Typically cpu_power for all the groups in a sched domain will be the same unless
5803 * there are asymmetries in the topology. If there are asymmetries, the group
5804 * having more cpu_power will pick up more load compared to the group having
5805 * less cpu_power.
5806 *
5807 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5808 * the maximum number of tasks a group can handle in the presence of other idle
5809 * or lightly loaded groups in the same sched domain.
5810 */
5811static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5812{
5813 struct sched_domain *child;
5814 struct sched_group *group;
5815
5816 WARN_ON(!sd || !sd->groups);
5817
5818 if (cpu != first_cpu(sd->groups->cpumask))
5819 return;
5820
5821 child = sd->child;
5822
5823 sd->groups->__cpu_power = 0;
5824
5825 /*
5826 * For perf policy, if the groups in child domain share resources
5827 * (for example cores sharing some portions of the cache hierarchy
5828 * or SMT), then set this domain groups cpu_power such that each group
5829 * can handle only one task, when there are other idle groups in the
5830 * same sched domain.
5831 */
5832 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5833 (child->flags &
5834 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5835 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5836 return;
5837 }
5838
5839 /*
5840 * add cpu_power of each child group to this group's cpu_power
5841 */
5842 group = child->groups;
5843 do {
5844 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5845 group = group->next;
5846 } while (group != child->groups);
5847}
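/*
 * Illustrative sketch (not part of sched.c): a stand-alone user-space model
 * of the two cpu_power policies above for an SMT package with two sibling
 * groups.  The constant 128 stands in for SCHED_LOAD_SCALE; the numbers are
 * examples only.
 */
#include <stdio.h>

#define TOY_LOAD_SCALE 128

/* perf policy for resource-sharing children: one task's worth of capacity */
static unsigned int toy_power_perf(void)
{
	return TOY_LOAD_SCALE;
}

/* default policy: the group carries what all of its child groups carry */
static unsigned int toy_power_sum(const unsigned int *child_power, int n)
{
	unsigned int sum = 0;
	int i;

	for (i = 0; i < n; i++)
		sum += child_power[i];
	return sum;
}

int main(void)
{
	unsigned int siblings[2] = { TOY_LOAD_SCALE, TOY_LOAD_SCALE };

	printf("perf policy: %u\n", toy_power_perf());
	printf("summed:      %u\n", toy_power_sum(siblings, 2));
	return 0;
}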
5848
5849/*
5850 * Build sched domains for a given set of cpus and attach the sched domains
5851 * to the individual cpus
5852 */
5853static int build_sched_domains(const cpumask_t *cpu_map)
5854{
5855 int i;
5856#ifdef CONFIG_NUMA
5857 struct sched_group **sched_group_nodes = NULL;
5858 int sd_allnodes = 0;
5859
5860 /*
5861 * Allocate the per-node list of sched groups
5862 */
5863 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5864 GFP_KERNEL);
5865 if (!sched_group_nodes) {
5866 printk(KERN_WARNING "Can not alloc sched group node list\n");
5867 return -ENOMEM;
5868 }
5869 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5870#endif
5871
5872 /*
5873 * Set up domains for cpus specified by the cpu_map.
5874 */
5875 for_each_cpu_mask(i, *cpu_map) {
5876 struct sched_domain *sd = NULL, *p;
5877 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5878
5879 cpus_and(nodemask, nodemask, *cpu_map);
5880
5881#ifdef CONFIG_NUMA
5882 if (cpus_weight(*cpu_map) >
5883 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5884 sd = &per_cpu(allnodes_domains, i);
5885 *sd = SD_ALLNODES_INIT;
5886 sd->span = *cpu_map;
5887 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5888 p = sd;
5889 sd_allnodes = 1;
5890 } else
5891 p = NULL;
5892
5893 sd = &per_cpu(node_domains, i);
5894 *sd = SD_NODE_INIT;
5895 sd->span = sched_domain_node_span(cpu_to_node(i));
5896 sd->parent = p;
5897 if (p)
5898 p->child = sd;
5899 cpus_and(sd->span, sd->span, *cpu_map);
5900#endif
5901
5902 p = sd;
5903 sd = &per_cpu(phys_domains, i);
5904 *sd = SD_CPU_INIT;
5905 sd->span = nodemask;
5906 sd->parent = p;
5907 if (p)
5908 p->child = sd;
5909 cpu_to_phys_group(i, cpu_map, &sd->groups);
5910
5911#ifdef CONFIG_SCHED_MC
5912 p = sd;
5913 sd = &per_cpu(core_domains, i);
5914 *sd = SD_MC_INIT;
5915 sd->span = cpu_coregroup_map(i);
5916 cpus_and(sd->span, sd->span, *cpu_map);
5917 sd->parent = p;
5918 p->child = sd;
5919 cpu_to_core_group(i, cpu_map, &sd->groups);
5920#endif
5921
5922#ifdef CONFIG_SCHED_SMT
5923 p = sd;
5924 sd = &per_cpu(cpu_domains, i);
5925 *sd = SD_SIBLING_INIT;
5926 sd->span = cpu_sibling_map[i];
5927 cpus_and(sd->span, sd->span, *cpu_map);
5928 sd->parent = p;
5929 p->child = sd;
5930 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5931#endif
5932 }
5933
5934#ifdef CONFIG_SCHED_SMT
5935 /* Set up CPU (sibling) groups */
5936 for_each_cpu_mask(i, *cpu_map) {
5937 cpumask_t this_sibling_map = cpu_sibling_map[i];
5938 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5939 if (i != first_cpu(this_sibling_map))
5940 continue;
5941
5942 init_sched_build_groups(this_sibling_map, cpu_map,
5943 &cpu_to_cpu_group);
5944 }
5945#endif
5946
5947#ifdef CONFIG_SCHED_MC
5948 /* Set up multi-core groups */
5949 for_each_cpu_mask(i, *cpu_map) {
5950 cpumask_t this_core_map = cpu_coregroup_map(i);
5951 cpus_and(this_core_map, this_core_map, *cpu_map);
5952 if (i != first_cpu(this_core_map))
5953 continue;
5954 init_sched_build_groups(this_core_map, cpu_map,
5955 &cpu_to_core_group);
5956 }
5957#endif
5958
5959 /* Set up physical groups */
5960 for (i = 0; i < MAX_NUMNODES; i++) {
5961 cpumask_t nodemask = node_to_cpumask(i);
5962
5963 cpus_and(nodemask, nodemask, *cpu_map);
5964 if (cpus_empty(nodemask))
5965 continue;
5966
5967 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5968 }
5969
5970#ifdef CONFIG_NUMA
5971 /* Set up node groups */
5972 if (sd_allnodes)
5973 init_sched_build_groups(*cpu_map, cpu_map,
5974 &cpu_to_allnodes_group);
5975
5976 for (i = 0; i < MAX_NUMNODES; i++) {
5977 /* Set up node groups */
5978 struct sched_group *sg, *prev;
5979 cpumask_t nodemask = node_to_cpumask(i);
5980 cpumask_t domainspan;
5981 cpumask_t covered = CPU_MASK_NONE;
5982 int j;
5983
5984 cpus_and(nodemask, nodemask, *cpu_map);
5985 if (cpus_empty(nodemask)) {
5986 sched_group_nodes[i] = NULL;
5987 continue;
5988 }
5989
5990 domainspan = sched_domain_node_span(i);
5991 cpus_and(domainspan, domainspan, *cpu_map);
5992
5993 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5994 if (!sg) {
5995 printk(KERN_WARNING "Can not alloc domain group for "
5996 "node %d\n", i);
5997 goto error;
5998 }
5999 sched_group_nodes[i] = sg;
6000 for_each_cpu_mask(j, nodemask) {
6001 struct sched_domain *sd;
6002 sd = &per_cpu(node_domains, j);
6003 sd->groups = sg;
6004 }
6005 sg->__cpu_power = 0;
6006 sg->cpumask = nodemask;
6007 sg->next = sg;
6008 cpus_or(covered, covered, nodemask);
6009 prev = sg;
6010
6011 for (j = 0; j < MAX_NUMNODES; j++) {
6012 cpumask_t tmp, notcovered;
6013 int n = (i + j) % MAX_NUMNODES;
6014
6015 cpus_complement(notcovered, covered);
6016 cpus_and(tmp, notcovered, *cpu_map);
6017 cpus_and(tmp, tmp, domainspan);
6018 if (cpus_empty(tmp))
6019 break;
6020
6021 nodemask = node_to_cpumask(n);
6022 cpus_and(tmp, tmp, nodemask);
6023 if (cpus_empty(tmp))
6024 continue;
6025
6026 sg = kmalloc_node(sizeof(struct sched_group),
6027 GFP_KERNEL, i);
6028 if (!sg) {
6029 printk(KERN_WARNING
6030 "Can not alloc domain group for node %d\n", j);
6031 goto error;
6032 }
6033 sg->__cpu_power = 0;
6034 sg->cpumask = tmp;
6035 sg->next = prev->next;
6036 cpus_or(covered, covered, tmp);
6037 prev->next = sg;
6038 prev = sg;
6039 }
6040 }
6041#endif
6042
6043 /* Calculate CPU power for physical packages and nodes */
6044#ifdef CONFIG_SCHED_SMT
6045 for_each_cpu_mask(i, *cpu_map) {
6046 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6047
6048 init_sched_groups_power(i, sd);
6049 }
6050#endif
6051#ifdef CONFIG_SCHED_MC
6052 for_each_cpu_mask(i, *cpu_map) {
6053 struct sched_domain *sd = &per_cpu(core_domains, i);
6054
6055 init_sched_groups_power(i, sd);
6056 }
6057#endif
6058
6059 for_each_cpu_mask(i, *cpu_map) {
6060 struct sched_domain *sd = &per_cpu(phys_domains, i);
6061
6062 init_sched_groups_power(i, sd);
6063 }
6064
6065#ifdef CONFIG_NUMA
6066 for (i = 0; i < MAX_NUMNODES; i++)
6067 init_numa_sched_groups_power(sched_group_nodes[i]);
6068
6069 if (sd_allnodes) {
6070 struct sched_group *sg;
6071
6072 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6073 init_numa_sched_groups_power(sg);
6074 }
6075#endif
6076
6077 /* Attach the domains */
6078 for_each_cpu_mask(i, *cpu_map) {
6079 struct sched_domain *sd;
6080#ifdef CONFIG_SCHED_SMT
6081 sd = &per_cpu(cpu_domains, i);
6082#elif defined(CONFIG_SCHED_MC)
6083 sd = &per_cpu(core_domains, i);
6084#else
6085 sd = &per_cpu(phys_domains, i);
6086#endif
6087 cpu_attach_domain(sd, i);
6088 }
6089
6090 return 0;
6091
6092#ifdef CONFIG_NUMA
6093error:
6094 free_sched_groups(cpu_map);
6095 return -ENOMEM;
6096#endif
6097}
6098/*
6099 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6100 */
6101static int arch_init_sched_domains(const cpumask_t *cpu_map)
6102{
6103 cpumask_t cpu_default_map;
6104 int err;
6105
6106 /*
6107 * Setup mask for cpus without special case scheduling requirements.
6108 * For now this just excludes isolated cpus, but could be used to
6109 * exclude other special cases in the future.
6110 */
6111 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6112
6113 err = build_sched_domains(&cpu_default_map);
6114
6115 return err;
6116}
6117
6118static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6119{
6120 free_sched_groups(cpu_map);
6121}
6122
6123/*
6124 * Detach sched domains from a group of cpus specified in cpu_map
6125 * These cpus will now be attached to the NULL domain
6126 */
6127static void detach_destroy_domains(const cpumask_t *cpu_map)
6128{
6129 int i;
6130
6131 for_each_cpu_mask(i, *cpu_map)
6132 cpu_attach_domain(NULL, i);
6133 synchronize_sched();
6134 arch_destroy_sched_domains(cpu_map);
6135}
6136
6137/*
6138 * Partition sched domains as specified by the cpumasks below.
6139 * This attaches all cpus from the cpumasks to the NULL domain,
6140 * waits for an RCU quiescent period, recalculates sched
6141 * domain information and then attaches them back to the
6142 * correct sched domains.
6143 * Call with the hotplug lock held.
6144 */
6145int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6146{
6147 cpumask_t change_map;
6148 int err = 0;
6149
6150 cpus_and(*partition1, *partition1, cpu_online_map);
6151 cpus_and(*partition2, *partition2, cpu_online_map);
6152 cpus_or(change_map, *partition1, *partition2);
6153
6154 /* Detach sched domains from all of the affected cpus */
6155 detach_destroy_domains(&change_map);
6156 if (!cpus_empty(*partition1))
6157 err = build_sched_domains(partition1);
6158 if (!err && !cpus_empty(*partition2))
6159 err = build_sched_domains(partition2);
6160
6161 return err;
6162}
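/*
 * Illustrative sketch (not part of sched.c): how a hypothetical in-kernel
 * caller might split the online CPUs into two balancing partitions.  Only
 * partition_sched_domains() above is real; toy_split_domains() is invented
 * for this example, and it would have to be called with the hotplug lock
 * held, like any other caller.
 */
static int toy_split_domains(void)
{
	cpumask_t low = CPU_MASK_NONE, high = CPU_MASK_NONE;
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu < 2)
			cpu_set(cpu, low);	/* CPUs 0-1 balance together */
		else
			cpu_set(cpu, high);	/* remaining CPUs form a second set */
	}
	return partition_sched_domains(&low, &high);
}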
6163
6164#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6165int arch_reinit_sched_domains(void)
6166{
6167 int err;
6168
6169 mutex_lock(&sched_hotcpu_mutex);
6170 detach_destroy_domains(&cpu_online_map);
6171 err = arch_init_sched_domains(&cpu_online_map);
6172 mutex_unlock(&sched_hotcpu_mutex);
6173
6174 return err;
6175}
6176
6177static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6178{
6179 int ret;
6180
6181 if (buf[0] != '0' && buf[0] != '1')
6182 return -EINVAL;
6183
6184 if (smt)
6185 sched_smt_power_savings = (buf[0] == '1');
6186 else
6187 sched_mc_power_savings = (buf[0] == '1');
6188
6189 ret = arch_reinit_sched_domains();
6190
6191 return ret ? ret : count;
6192}
6193
6194int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6195{
6196 int err = 0;
6197
6198#ifdef CONFIG_SCHED_SMT
6199 if (smt_capable())
6200 err = sysfs_create_file(&cls->kset.kobj,
6201 &attr_sched_smt_power_savings.attr);
6202#endif
6203#ifdef CONFIG_SCHED_MC
6204 if (!err && mc_capable())
6205 err = sysfs_create_file(&cls->kset.kobj,
6206 &attr_sched_mc_power_savings.attr);
6207#endif
6208 return err;
6209}
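/*
 * Usage note (illustrative): the entries created by
 * sched_create_sysfs_power_savings_entries() above appear under the cpu
 * sysdev class, so the power-savings policy can be toggled from user space
 * with e.g.
 *
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 *
 * Only "0" and "1" are accepted; anything else returns -EINVAL, and the
 * sched domains are rebuilt on every successful write.
 */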
6210#endif
6211
6212#ifdef CONFIG_SCHED_MC
6213static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6214{
6215 return sprintf(page, "%u\n", sched_mc_power_savings);
6216}
6217static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6218 const char *buf, size_t count)
6219{
6220 return sched_power_savings_store(buf, count, 0);
6221}
6222SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6223 sched_mc_power_savings_store);
6224#endif
6225
6226#ifdef CONFIG_SCHED_SMT
6227static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6228{
6229 return sprintf(page, "%u\n", sched_smt_power_savings);
6230}
6231static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6232 const char *buf, size_t count)
6233{
6234 return sched_power_savings_store(buf, count, 1);
6235}
6236SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6237 sched_smt_power_savings_store);
6238#endif
6239
6240/*
6241 * Force a reinitialization of the sched domains hierarchy. The domains
6242 * and groups cannot be updated in place without racing with the balancing
6243 * code, so we temporarily attach all running cpus to the NULL domain
6244 * which will prevent rebalancing while the sched domains are recalculated.
6245 */
6246static int update_sched_domains(struct notifier_block *nfb,
6247 unsigned long action, void *hcpu)
6248{
6249 switch (action) {
6250 case CPU_UP_PREPARE:
6251 case CPU_UP_PREPARE_FROZEN:
6252 case CPU_DOWN_PREPARE:
6253 case CPU_DOWN_PREPARE_FROZEN:
6254 detach_destroy_domains(&cpu_online_map);
6255 return NOTIFY_OK;
6256
6257 case CPU_UP_CANCELED:
6258 case CPU_UP_CANCELED_FROZEN:
6259 case CPU_DOWN_FAILED:
6260 case CPU_DOWN_FAILED_FROZEN:
6261 case CPU_ONLINE:
6262 case CPU_ONLINE_FROZEN:
6263 case CPU_DEAD:
6264 case CPU_DEAD_FROZEN:
6265 /*
6266 * Fall through and re-initialise the domains.
6267 */
6268 break;
6269 default:
6270 return NOTIFY_DONE;
6271 }
6272
6273 /* The hotplug lock is already held by cpu_up/cpu_down */
6274 arch_init_sched_domains(&cpu_online_map);
6275
6276 return NOTIFY_OK;
6277}
6278
6279void __init sched_init_smp(void)
6280{
6281 cpumask_t non_isolated_cpus;
6282
6283 mutex_lock(&sched_hotcpu_mutex);
6284 arch_init_sched_domains(&cpu_online_map);
6285 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6286 if (cpus_empty(non_isolated_cpus))
6287 cpu_set(smp_processor_id(), non_isolated_cpus);
6288 mutex_unlock(&sched_hotcpu_mutex);
6289 /* XXX: Theoretical race here - CPU may be hotplugged now */
6290 hotcpu_notifier(update_sched_domains, 0);
6291
6292 /* Move init over to a non-isolated CPU */
6293 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6294 BUG();
6295 sched_init_granularity();
6296}
6297#else
6298void __init sched_init_smp(void)
6299{
6300 sched_init_granularity();
6301}
6302#endif /* CONFIG_SMP */
6303
6304int in_sched_functions(unsigned long addr)
6305{
6306 /* Linker adds these: start and end of __sched functions */
6307 extern char __sched_text_start[], __sched_text_end[];
6308
6309 return in_lock_functions(addr) ||
6310 (addr >= (unsigned long)__sched_text_start
6311 && addr < (unsigned long)__sched_text_end);
6312}
6313
6314static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6315{
6316 cfs_rq->tasks_timeline = RB_ROOT;
6317 cfs_rq->fair_clock = 1;
6318#ifdef CONFIG_FAIR_GROUP_SCHED
6319 cfs_rq->rq = rq;
6320#endif
6321}
6322
6323void __init sched_init(void)
6324{
6325 u64 now = sched_clock();
6326 int highest_cpu = 0;
6327 int i, j;
6328
6329 /*
6330 * Link up the scheduling class hierarchy:
6331 */
6332 rt_sched_class.next = &fair_sched_class;
6333 fair_sched_class.next = &idle_sched_class;
6334 idle_sched_class.next = NULL;
6335
6336 for_each_possible_cpu(i) {
6337 struct rt_prio_array *array;
6338 struct rq *rq;
6339
6340 rq = cpu_rq(i);
6341 spin_lock_init(&rq->lock);
6342 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6343 rq->nr_running = 0;
6344 rq->clock = 1;
6345 init_cfs_rq(&rq->cfs, rq);
6346#ifdef CONFIG_FAIR_GROUP_SCHED
6347 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6348 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6349#endif
6350 rq->ls.load_update_last = now;
6351 rq->ls.load_update_start = now;
6352
6353 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6354 rq->cpu_load[j] = 0;
6355#ifdef CONFIG_SMP
6356 rq->sd = NULL;
6357 rq->active_balance = 0;
6358 rq->next_balance = jiffies;
6359 rq->push_cpu = 0;
6360 rq->cpu = i;
6361 rq->migration_thread = NULL;
6362 INIT_LIST_HEAD(&rq->migration_queue);
6363#endif
6364 atomic_set(&rq->nr_iowait, 0);
6365
6366 array = &rq->rt.active;
6367 for (j = 0; j < MAX_RT_PRIO; j++) {
6368 INIT_LIST_HEAD(array->queue + j);
6369 __clear_bit(j, array->bitmap);
6370 }
6371 highest_cpu = i;
6372 /* delimiter for bitsearch: */
6373 __set_bit(MAX_RT_PRIO, array->bitmap);
6374 }
6375
6376 set_load_weight(&init_task);
6377
6378#ifdef CONFIG_SMP
6379 nr_cpu_ids = highest_cpu + 1;
6380 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6381#endif
6382
6383#ifdef CONFIG_RT_MUTEXES
6384 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6385#endif
6386
6387 /*
6388 * The boot idle thread does lazy MMU switching as well:
6389 */
6390 atomic_inc(&init_mm.mm_count);
6391 enter_lazy_tlb(&init_mm, current);
6392
6393 /*
6394 * Make us the idle thread. Technically, schedule() should not be
6395 * called from this thread, however somewhere below it might be,
6396 * but because we are the idle thread, we just pick up running again
6397 * when this runqueue becomes "idle".
6398 */
6399 init_idle(current, smp_processor_id());
6400 /*
6401 * During early bootup we pretend to be a normal task:
6402 */
6403 current->sched_class = &fair_sched_class;
6404}
6405
6406#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6407void __might_sleep(char *file, int line)
6408{
6409#ifdef in_atomic
6410 static unsigned long prev_jiffy; /* ratelimiting */
6411
6412 if ((in_atomic() || irqs_disabled()) &&
6413 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6414 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6415 return;
6416 prev_jiffy = jiffies;
6417 printk(KERN_ERR "BUG: sleeping function called from invalid"
6418 " context at %s:%d\n", file, line);
6419 printk("in_atomic():%d, irqs_disabled():%d\n",
6420 in_atomic(), irqs_disabled());
6421 debug_show_held_locks(current);
6422 if (irqs_disabled())
6423 print_irqtrace_events(current);
6424 dump_stack();
6425 }
6426#endif
6427}
6428EXPORT_SYMBOL(__might_sleep);
6429#endif
6430
6431#ifdef CONFIG_MAGIC_SYSRQ
6432void normalize_rt_tasks(void)
6433{
6434 struct task_struct *g, *p;
6435 unsigned long flags;
6436 struct rq *rq;
6437 int on_rq;
6438
6439 read_lock_irq(&tasklist_lock);
6440 do_each_thread(g, p) {
6441 p->se.fair_key = 0;
6442 p->se.wait_runtime = 0;
6443 p->se.wait_start_fair = 0;
6444 p->se.wait_start = 0;
6445 p->se.exec_start = 0;
6446 p->se.sleep_start = 0;
6447 p->se.sleep_start_fair = 0;
6448 p->se.block_start = 0;
6449 task_rq(p)->cfs.fair_clock = 0;
6450 task_rq(p)->clock = 0;
6451
6452 if (!rt_task(p)) {
6453 /*
6454 * Renice negative nice level userspace
6455 * tasks back to 0:
6456 */
6457 if (TASK_NICE(p) < 0 && p->mm)
6458 set_user_nice(p, 0);
6459 continue;
6460 }
6461
6462 spin_lock_irqsave(&p->pi_lock, flags);
6463 rq = __task_rq_lock(p);
6464#ifdef CONFIG_SMP
6465 /*
6466 * Do not touch the migration thread:
6467 */
6468 if (p == rq->migration_thread)
6469 goto out_unlock;
6470#endif
6471
6472 on_rq = p->se.on_rq;
6473 if (on_rq)
6474 deactivate_task(task_rq(p), p, 0);
6475 __setscheduler(rq, p, SCHED_NORMAL, 0);
6476 if (on_rq) {
6477 activate_task(task_rq(p), p, 0);
6478 resched_task(rq->curr);
6479 }
6480#ifdef CONFIG_SMP
6481 out_unlock:
6482#endif
6483 __task_rq_unlock(rq);
6484 spin_unlock_irqrestore(&p->pi_lock, flags);
6485 } while_each_thread(g, p);
6486
6487 read_unlock_irq(&tasklist_lock);
6488}
6489
6490#endif /* CONFIG_MAGIC_SYSRQ */
6491
6492#ifdef CONFIG_IA64
6493/*
6494 * These functions are only useful for the IA64 MCA handling.
6495 *
6496 * They can only be called when the whole system has been
6497 * stopped - every CPU needs to be quiescent, and no scheduling
6498 * activity can take place. Using them for anything else would
6499 * be a serious bug, and as a result, they aren't even visible
6500 * under any other configuration.
6501 */
6502
6503/**
6504 * curr_task - return the current task for a given cpu.
6505 * @cpu: the processor in question.
6506 *
6507 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6508 */
6509struct task_struct *curr_task(int cpu)
6510{
6511 return cpu_curr(cpu);
6512}
6513
6514/**
6515 * set_curr_task - set the current task for a given cpu.
6516 * @cpu: the processor in question.
6517 * @p: the task pointer to set.
6518 *
6519 * Description: This function must only be used when non-maskable interrupts
6520 * are serviced on a separate stack. It allows the architecture to switch the
6521 * notion of the current task on a cpu in a non-blocking manner. This function
6522 * must be called with all CPUs synchronized, and interrupts disabled; the
6523 * caller must save the original value of the current task (see
6524 * curr_task() above) and restore that value before reenabling interrupts and
6525 * re-starting the system.
6526 *
6527 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6528 */
6529void set_curr_task(int cpu, struct task_struct *p)
6530{
6531 cpu_curr(cpu) = p;
6532}
6533
6534#endif